diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,44851 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9887464832760238, + "eval_steps": 1600, + "global_step": 6398, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00031259768677711783, + "grad_norm": 0.21875, + "learning_rate": 2e-05, + "loss": 2.2083, + "step": 1 + }, + { + "epoch": 0.00031259768677711783, + "eval_loss": 2.0116653442382812, + "eval_runtime": 1898.4475, + "eval_samples_per_second": 4.813, + "eval_steps_per_second": 2.407, + "step": 1 + }, + { + "epoch": 0.0006251953735542357, + "grad_norm": 0.216796875, + "learning_rate": 4e-05, + "loss": 2.0017, + "step": 2 + }, + { + "epoch": 0.0009377930603313535, + "grad_norm": 0.216796875, + "learning_rate": 6e-05, + "loss": 2.2668, + "step": 3 + }, + { + "epoch": 0.0012503907471084713, + "grad_norm": 0.220703125, + "learning_rate": 8e-05, + "loss": 1.9291, + "step": 4 + }, + { + "epoch": 0.0015629884338855893, + "grad_norm": 0.22265625, + "learning_rate": 0.0001, + "loss": 1.8984, + "step": 5 + }, + { + "epoch": 0.001875586120662707, + "grad_norm": 0.1845703125, + "learning_rate": 0.00012, + "loss": 2.1924, + "step": 6 + }, + { + "epoch": 0.002188183807439825, + "grad_norm": 0.1982421875, + "learning_rate": 0.00014, + "loss": 2.135, + "step": 7 + }, + { + "epoch": 0.0025007814942169426, + "grad_norm": 0.1865234375, + "learning_rate": 0.00016, + "loss": 2.0434, + "step": 8 + }, + { + "epoch": 0.002813379180994061, + "grad_norm": 0.1826171875, + "learning_rate": 0.00018, + "loss": 1.892, + "step": 9 + }, + { + "epoch": 0.0031259768677711786, + "grad_norm": 0.1953125, + "learning_rate": 0.0002, + "loss": 2.0976, + "step": 10 + }, + { + "epoch": 0.0034385745545482964, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001999999969814363, + "loss": 2.257, + "step": 11 + }, + { + "epoch": 0.003751172241325414, + "grad_norm": 0.185546875, + "learning_rate": 0.00019999998792574533, + "loss": 2.0452, + "step": 12 + }, + { + "epoch": 0.004063769928102532, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019999997283292765, + "loss": 1.9728, + "step": 13 + }, + { + "epoch": 0.00437636761487965, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001999999517029842, + "loss": 2.0065, + "step": 14 + }, + { + "epoch": 0.004688965301656768, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019999992453591622, + "loss": 2.0628, + "step": 15 + }, + { + "epoch": 0.005001562988433885, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019999989133172538, + "loss": 2.0292, + "step": 16 + }, + { + "epoch": 0.0053141606752110035, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019999985209041366, + "loss": 1.9125, + "step": 17 + }, + { + "epoch": 0.005626758361988122, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019999980681198345, + "loss": 1.8677, + "step": 18 + }, + { + "epoch": 0.005939356048765239, + "grad_norm": 0.16796875, + "learning_rate": 0.00019999975549643746, + "loss": 1.984, + "step": 19 + }, + { + "epoch": 0.006251953735542357, + "grad_norm": 0.177734375, + "learning_rate": 0.00019999969814377878, + "loss": 1.9886, + "step": 20 + }, + { + "epoch": 0.006564551422319475, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001999996347540109, + "loss": 2.0277, + "step": 21 + }, + { + "epoch": 0.006877149109096593, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001999995653271377, + 
"loss": 2.0216, + "step": 22 + }, + { + "epoch": 0.00718974679587371, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019999948986316324, + "loss": 1.8909, + "step": 23 + }, + { + "epoch": 0.007502344482650828, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019999940836209215, + "loss": 1.9551, + "step": 24 + }, + { + "epoch": 0.007814942169427946, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019999932082392937, + "loss": 1.9186, + "step": 25 + }, + { + "epoch": 0.008127539856205065, + "grad_norm": 0.18359375, + "learning_rate": 0.00019999922724868015, + "loss": 2.076, + "step": 26 + }, + { + "epoch": 0.008440137542982182, + "grad_norm": 0.181640625, + "learning_rate": 0.00019999912763635016, + "loss": 1.9682, + "step": 27 + }, + { + "epoch": 0.0087527352297593, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019999902198694543, + "loss": 1.8056, + "step": 28 + }, + { + "epoch": 0.009065332916536417, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019999891030047227, + "loss": 1.8404, + "step": 29 + }, + { + "epoch": 0.009377930603313536, + "grad_norm": 0.16015625, + "learning_rate": 0.0001999987925769375, + "loss": 2.113, + "step": 30 + }, + { + "epoch": 0.009690528290090653, + "grad_norm": 0.166015625, + "learning_rate": 0.00019999866881634815, + "loss": 1.8715, + "step": 31 + }, + { + "epoch": 0.01000312597686777, + "grad_norm": 0.8359375, + "learning_rate": 0.00019999853901871175, + "loss": 3.09, + "step": 32 + }, + { + "epoch": 0.01031572366364489, + "grad_norm": 0.1640625, + "learning_rate": 0.00019999840318403613, + "loss": 2.1366, + "step": 33 + }, + { + "epoch": 0.010628321350422007, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019999826131232947, + "loss": 1.9007, + "step": 34 + }, + { + "epoch": 0.010940919037199124, + "grad_norm": 0.173828125, + "learning_rate": 0.00019999811340360034, + "loss": 1.9831, + "step": 35 + }, + { + "epoch": 0.011253516723976243, + "grad_norm": 0.1611328125, + "learning_rate": 0.0001999979594578577, + "loss": 1.9656, + "step": 36 + }, + { + "epoch": 0.01156611441075336, + "grad_norm": 0.48828125, + "learning_rate": 0.0001999977994751108, + "loss": 2.8559, + "step": 37 + }, + { + "epoch": 0.011878712097530478, + "grad_norm": 0.177734375, + "learning_rate": 0.00019999763345536934, + "loss": 2.0553, + "step": 38 + }, + { + "epoch": 0.012191309784307595, + "grad_norm": 0.18359375, + "learning_rate": 0.0001999974613986433, + "loss": 1.8912, + "step": 39 + }, + { + "epoch": 0.012503907471084715, + "grad_norm": 0.169921875, + "learning_rate": 0.0001999972833049431, + "loss": 1.8342, + "step": 40 + }, + { + "epoch": 0.012816505157861832, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019999709917427946, + "loss": 1.9032, + "step": 41 + }, + { + "epoch": 0.01312910284463895, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019999690900666353, + "loss": 1.9228, + "step": 42 + }, + { + "epoch": 0.013441700531416068, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019999671280210676, + "loss": 1.9666, + "step": 43 + }, + { + "epoch": 0.013754298218193186, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019999651056062102, + "loss": 1.901, + "step": 44 + }, + { + "epoch": 0.014066895904970303, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019999630228221852, + "loss": 2.1406, + "step": 45 + }, + { + "epoch": 0.01437949359174742, + "grad_norm": 0.16796875, + "learning_rate": 0.0001999960879669118, + "loss": 1.9389, + "step": 46 + }, + { + "epoch": 0.01469209127852454, + "grad_norm": 0.1689453125, + 
"learning_rate": 0.00019999586761471384, + "loss": 1.9625, + "step": 47 + }, + { + "epoch": 0.015004688965301657, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019999564122563795, + "loss": 2.0456, + "step": 48 + }, + { + "epoch": 0.015317286652078774, + "grad_norm": 0.171875, + "learning_rate": 0.00019999540879969775, + "loss": 1.9955, + "step": 49 + }, + { + "epoch": 0.01562988433885589, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019999517033690727, + "loss": 1.8969, + "step": 50 + }, + { + "epoch": 0.01594248202563301, + "grad_norm": 0.166015625, + "learning_rate": 0.00019999492583728097, + "loss": 1.8544, + "step": 51 + }, + { + "epoch": 0.01625507971241013, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019999467530083356, + "loss": 1.779, + "step": 52 + }, + { + "epoch": 0.016567677399187245, + "grad_norm": 0.177734375, + "learning_rate": 0.00019999441872758017, + "loss": 1.8127, + "step": 53 + }, + { + "epoch": 0.016880275085964364, + "grad_norm": 0.1796875, + "learning_rate": 0.0001999941561175363, + "loss": 2.0516, + "step": 54 + }, + { + "epoch": 0.017192872772741483, + "grad_norm": 0.173828125, + "learning_rate": 0.0001999938874707178, + "loss": 2.1641, + "step": 55 + }, + { + "epoch": 0.0175054704595186, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019999361278714092, + "loss": 2.0483, + "step": 56 + }, + { + "epoch": 0.017818068146295718, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019999333206682218, + "loss": 1.9931, + "step": 57 + }, + { + "epoch": 0.018130665833072834, + "grad_norm": 0.181640625, + "learning_rate": 0.00019999304530977856, + "loss": 1.9795, + "step": 58 + }, + { + "epoch": 0.018443263519849953, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019999275251602738, + "loss": 2.13, + "step": 59 + }, + { + "epoch": 0.018755861206627072, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001999924536855863, + "loss": 2.0652, + "step": 60 + }, + { + "epoch": 0.019068458893404187, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019999214881847338, + "loss": 1.9731, + "step": 61 + }, + { + "epoch": 0.019381056580181306, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019999183791470702, + "loss": 1.9303, + "step": 62 + }, + { + "epoch": 0.019693654266958426, + "grad_norm": 0.169921875, + "learning_rate": 0.000199991520974306, + "loss": 1.9115, + "step": 63 + }, + { + "epoch": 0.02000625195373554, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001999911979972894, + "loss": 2.1622, + "step": 64 + }, + { + "epoch": 0.02031884964051266, + "grad_norm": 0.169921875, + "learning_rate": 0.00019999086898367678, + "loss": 1.9662, + "step": 65 + }, + { + "epoch": 0.02063144732728978, + "grad_norm": 0.16796875, + "learning_rate": 0.00019999053393348796, + "loss": 1.8382, + "step": 66 + }, + { + "epoch": 0.020944045014066895, + "grad_norm": 0.17578125, + "learning_rate": 0.00019999019284674317, + "loss": 1.9147, + "step": 67 + }, + { + "epoch": 0.021256642700844014, + "grad_norm": 0.171875, + "learning_rate": 0.00019998984572346308, + "loss": 2.0712, + "step": 68 + }, + { + "epoch": 0.021569240387621133, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019998949256366854, + "loss": 2.0207, + "step": 69 + }, + { + "epoch": 0.02188183807439825, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019998913336738094, + "loss": 2.1334, + "step": 70 + }, + { + "epoch": 0.022194435761175368, + "grad_norm": 0.7109375, + "learning_rate": 0.00019998876813462192, + "loss": 2.7085, + "step": 71 + }, + { + "epoch": 0.022507033447952487, + 
"grad_norm": 0.17578125, + "learning_rate": 0.00019998839686541356, + "loss": 1.7364, + "step": 72 + }, + { + "epoch": 0.022819631134729602, + "grad_norm": 0.18359375, + "learning_rate": 0.0001999880195597783, + "loss": 1.9281, + "step": 73 + }, + { + "epoch": 0.02313222882150672, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019998763621773883, + "loss": 1.9648, + "step": 74 + }, + { + "epoch": 0.023444826508283837, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019998724683931838, + "loss": 1.9874, + "step": 75 + }, + { + "epoch": 0.023757424195060956, + "grad_norm": 0.171875, + "learning_rate": 0.0001999868514245404, + "loss": 1.785, + "step": 76 + }, + { + "epoch": 0.024070021881838075, + "grad_norm": 0.181640625, + "learning_rate": 0.0001999864499734288, + "loss": 1.9094, + "step": 77 + }, + { + "epoch": 0.02438261956861519, + "grad_norm": 0.162109375, + "learning_rate": 0.00019998604248600777, + "loss": 1.9723, + "step": 78 + }, + { + "epoch": 0.02469521725539231, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019998562896230196, + "loss": 1.8739, + "step": 79 + }, + { + "epoch": 0.02500781494216943, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019998520940233636, + "loss": 1.936, + "step": 80 + }, + { + "epoch": 0.025320412628946545, + "grad_norm": 0.16015625, + "learning_rate": 0.0001999847838061362, + "loss": 1.8807, + "step": 81 + }, + { + "epoch": 0.025633010315723664, + "grad_norm": 0.173828125, + "learning_rate": 0.00019998435217372728, + "loss": 1.7412, + "step": 82 + }, + { + "epoch": 0.025945608002500783, + "grad_norm": 0.17578125, + "learning_rate": 0.00019998391450513556, + "loss": 1.8404, + "step": 83 + }, + { + "epoch": 0.0262582056892779, + "grad_norm": 0.169921875, + "learning_rate": 0.00019998347080038754, + "loss": 1.8108, + "step": 84 + }, + { + "epoch": 0.026570803376055017, + "grad_norm": 0.181640625, + "learning_rate": 0.00019998302105950994, + "loss": 2.0934, + "step": 85 + }, + { + "epoch": 0.026883401062832137, + "grad_norm": 0.19140625, + "learning_rate": 0.00019998256528252998, + "loss": 2.0021, + "step": 86 + }, + { + "epoch": 0.027195998749609252, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019998210346947515, + "loss": 1.9675, + "step": 87 + }, + { + "epoch": 0.02750859643638637, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019998163562037332, + "loss": 1.8488, + "step": 88 + }, + { + "epoch": 0.02782119412316349, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019998116173525272, + "loss": 1.9255, + "step": 89 + }, + { + "epoch": 0.028133791809940606, + "grad_norm": 0.16796875, + "learning_rate": 0.000199980681814142, + "loss": 2.1055, + "step": 90 + }, + { + "epoch": 0.028446389496717725, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001999801958570701, + "loss": 2.1303, + "step": 91 + }, + { + "epoch": 0.02875898718349484, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019997970386406637, + "loss": 2.0517, + "step": 92 + }, + { + "epoch": 0.02907158487027196, + "grad_norm": 0.181640625, + "learning_rate": 0.00019997920583516053, + "loss": 1.8314, + "step": 93 + }, + { + "epoch": 0.02938418255704908, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001999787017703826, + "loss": 1.7953, + "step": 94 + }, + { + "epoch": 0.029696780243826194, + "grad_norm": 0.169921875, + "learning_rate": 0.00019997819166976308, + "loss": 1.8238, + "step": 95 + }, + { + "epoch": 0.030009377930603313, + "grad_norm": 0.173828125, + "learning_rate": 0.0001999776755333327, + "loss": 1.8905, + "step": 96 + }, + { + "epoch": 
0.030321975617380433, + "grad_norm": 0.169921875, + "learning_rate": 0.00019997715336112263, + "loss": 1.7594, + "step": 97 + }, + { + "epoch": 0.030634573304157548, + "grad_norm": 0.171875, + "learning_rate": 0.0001999766251531644, + "loss": 1.9648, + "step": 98 + }, + { + "epoch": 0.030947170990934667, + "grad_norm": 0.1669921875, + "learning_rate": 0.00019997609090948996, + "loss": 2.1577, + "step": 99 + }, + { + "epoch": 0.03125976867771178, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001999755506301315, + "loss": 2.0563, + "step": 100 + }, + { + "epoch": 0.0315723663644889, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001999750043151216, + "loss": 1.8857, + "step": 101 + }, + { + "epoch": 0.03188496405126602, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019997445196449337, + "loss": 1.8832, + "step": 102 + }, + { + "epoch": 0.03219756173804314, + "grad_norm": 0.189453125, + "learning_rate": 0.00019997389357828, + "loss": 2.0352, + "step": 103 + }, + { + "epoch": 0.03251015942482026, + "grad_norm": 0.18359375, + "learning_rate": 0.00019997332915651532, + "loss": 2.0126, + "step": 104 + }, + { + "epoch": 0.03282275711159737, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019997275869923335, + "loss": 2.0201, + "step": 105 + }, + { + "epoch": 0.03313535479837449, + "grad_norm": 0.177734375, + "learning_rate": 0.00019997218220646853, + "loss": 2.3295, + "step": 106 + }, + { + "epoch": 0.03344795248515161, + "grad_norm": 0.16796875, + "learning_rate": 0.0001999715996782557, + "loss": 1.8919, + "step": 107 + }, + { + "epoch": 0.03376055017192873, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019997101111462998, + "loss": 1.7797, + "step": 108 + }, + { + "epoch": 0.03407314785870585, + "grad_norm": 0.193359375, + "learning_rate": 0.00019997041651562695, + "loss": 1.8956, + "step": 109 + }, + { + "epoch": 0.03438574554548297, + "grad_norm": 0.1640625, + "learning_rate": 0.00019996981588128244, + "loss": 1.9683, + "step": 110 + }, + { + "epoch": 0.03469834323226008, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019996920921163278, + "loss": 1.727, + "step": 111 + }, + { + "epoch": 0.0350109409190372, + "grad_norm": 0.1630859375, + "learning_rate": 0.00019996859650671457, + "loss": 1.8966, + "step": 112 + }, + { + "epoch": 0.03532353860581432, + "grad_norm": 0.16796875, + "learning_rate": 0.0001999679777665648, + "loss": 1.8195, + "step": 113 + }, + { + "epoch": 0.035636136292591436, + "grad_norm": 0.173828125, + "learning_rate": 0.0001999673529912208, + "loss": 1.8785, + "step": 114 + }, + { + "epoch": 0.035948733979368555, + "grad_norm": 0.17578125, + "learning_rate": 0.0001999667221807203, + "loss": 1.9363, + "step": 115 + }, + { + "epoch": 0.03626133166614567, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019996608533510144, + "loss": 2.0314, + "step": 116 + }, + { + "epoch": 0.036573929352922786, + "grad_norm": 0.17578125, + "learning_rate": 0.0001999654424544026, + "loss": 1.9304, + "step": 117 + }, + { + "epoch": 0.036886527039699905, + "grad_norm": 0.181640625, + "learning_rate": 0.0001999647935386626, + "loss": 2.1456, + "step": 118 + }, + { + "epoch": 0.037199124726477024, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001999641385879206, + "loss": 1.6728, + "step": 119 + }, + { + "epoch": 0.037511722413254144, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019996347760221624, + "loss": 1.8201, + "step": 120 + }, + { + "epoch": 0.03782432010003126, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001999628105815893, + "loss": 
2.096, + "step": 121 + }, + { + "epoch": 0.038136917786808375, + "grad_norm": 0.171875, + "learning_rate": 0.0001999621375260801, + "loss": 1.9948, + "step": 122 + }, + { + "epoch": 0.038449515473585494, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001999614584357293, + "loss": 1.9562, + "step": 123 + }, + { + "epoch": 0.03876211316036261, + "grad_norm": 0.173828125, + "learning_rate": 0.00019996077331057788, + "loss": 1.8452, + "step": 124 + }, + { + "epoch": 0.03907471084713973, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019996008215066716, + "loss": 1.7615, + "step": 125 + }, + { + "epoch": 0.03938730853391685, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019995938495603893, + "loss": 1.7628, + "step": 126 + }, + { + "epoch": 0.03969990622069397, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019995868172673523, + "loss": 2.0241, + "step": 127 + }, + { + "epoch": 0.04001250390747108, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019995797246279856, + "loss": 2.0807, + "step": 128 + }, + { + "epoch": 0.0403251015942482, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019995725716427169, + "loss": 1.8564, + "step": 129 + }, + { + "epoch": 0.04063769928102532, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019995653583119785, + "loss": 2.1278, + "step": 130 + }, + { + "epoch": 0.04095029696780244, + "grad_norm": 0.1689453125, + "learning_rate": 0.00019995580846362055, + "loss": 2.095, + "step": 131 + }, + { + "epoch": 0.04126289465457956, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019995507506158372, + "loss": 1.6848, + "step": 132 + }, + { + "epoch": 0.04157549234135667, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019995433562513163, + "loss": 1.8979, + "step": 133 + }, + { + "epoch": 0.04188809002813379, + "grad_norm": 0.427734375, + "learning_rate": 0.00019995359015430894, + "loss": 2.9492, + "step": 134 + }, + { + "epoch": 0.04220068771491091, + "grad_norm": 0.173828125, + "learning_rate": 0.0001999528386491606, + "loss": 1.8951, + "step": 135 + }, + { + "epoch": 0.04251328540168803, + "grad_norm": 0.17578125, + "learning_rate": 0.00019995208110973206, + "loss": 1.7656, + "step": 136 + }, + { + "epoch": 0.04282588308846515, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019995131753606902, + "loss": 2.0607, + "step": 137 + }, + { + "epoch": 0.043138480775242266, + "grad_norm": 0.1611328125, + "learning_rate": 0.00019995054792821754, + "loss": 1.6803, + "step": 138 + }, + { + "epoch": 0.04345107846201938, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019994977228622414, + "loss": 2.0165, + "step": 139 + }, + { + "epoch": 0.0437636761487965, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001999489906101356, + "loss": 1.9388, + "step": 140 + }, + { + "epoch": 0.044076273835573616, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019994820289999913, + "loss": 1.6209, + "step": 141 + }, + { + "epoch": 0.044388871522350735, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001999474091558623, + "loss": 1.8157, + "step": 142 + }, + { + "epoch": 0.044701469209127855, + "grad_norm": 0.171875, + "learning_rate": 0.00019994660937777301, + "loss": 1.7581, + "step": 143 + }, + { + "epoch": 0.045014066895904974, + "grad_norm": 0.279296875, + "learning_rate": 0.00019994580356577957, + "loss": 2.6888, + "step": 144 + }, + { + "epoch": 0.045326664582682086, + "grad_norm": 0.1796875, + "learning_rate": 0.00019994499171993056, + "loss": 2.0103, + "step": 145 + }, + { + "epoch": 0.045639262269459205, + "grad_norm": 0.171875, + 
"learning_rate": 0.00019994417384027507, + "loss": 1.7455, + "step": 146 + }, + { + "epoch": 0.045951859956236324, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019994334992686245, + "loss": 1.9287, + "step": 147 + }, + { + "epoch": 0.04626445764301344, + "grad_norm": 0.1875, + "learning_rate": 0.00019994251997974241, + "loss": 1.8521, + "step": 148 + }, + { + "epoch": 0.04657705532979056, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019994168399896508, + "loss": 2.0915, + "step": 149 + }, + { + "epoch": 0.046889653016567674, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019994084198458097, + "loss": 2.0972, + "step": 150 + }, + { + "epoch": 0.04720225070334479, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019993999393664083, + "loss": 2.2031, + "step": 151 + }, + { + "epoch": 0.04751484839012191, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019993913985519592, + "loss": 1.8532, + "step": 152 + }, + { + "epoch": 0.04782744607689903, + "grad_norm": 0.18359375, + "learning_rate": 0.0001999382797402978, + "loss": 1.9278, + "step": 153 + }, + { + "epoch": 0.04814004376367615, + "grad_norm": 0.1796875, + "learning_rate": 0.00019993741359199834, + "loss": 1.6459, + "step": 154 + }, + { + "epoch": 0.04845264145045327, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001999365414103499, + "loss": 1.8459, + "step": 155 + }, + { + "epoch": 0.04876523913723038, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001999356631954051, + "loss": 1.9833, + "step": 156 + }, + { + "epoch": 0.0490778368240075, + "grad_norm": 0.177734375, + "learning_rate": 0.00019993477894721698, + "loss": 1.8361, + "step": 157 + }, + { + "epoch": 0.04939043451078462, + "grad_norm": 0.17578125, + "learning_rate": 0.0001999338886658389, + "loss": 2.0878, + "step": 158 + }, + { + "epoch": 0.04970303219756174, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001999329923513246, + "loss": 1.9454, + "step": 159 + }, + { + "epoch": 0.05001562988433886, + "grad_norm": 0.177734375, + "learning_rate": 0.00019993209000372818, + "loss": 1.982, + "step": 160 + }, + { + "epoch": 0.05032822757111598, + "grad_norm": 0.1796875, + "learning_rate": 0.00019993118162310415, + "loss": 1.9192, + "step": 161 + }, + { + "epoch": 0.05064082525789309, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001999302672095074, + "loss": 1.8865, + "step": 162 + }, + { + "epoch": 0.05095342294467021, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019992934676299302, + "loss": 1.6733, + "step": 163 + }, + { + "epoch": 0.05126602063144733, + "grad_norm": 0.169921875, + "learning_rate": 0.00019992842028361665, + "loss": 1.9374, + "step": 164 + }, + { + "epoch": 0.051578618318224446, + "grad_norm": 0.1953125, + "learning_rate": 0.0001999274877714342, + "loss": 1.9537, + "step": 165 + }, + { + "epoch": 0.051891216005001566, + "grad_norm": 0.17578125, + "learning_rate": 0.000199926549226502, + "loss": 1.8767, + "step": 166 + }, + { + "epoch": 0.05220381369177868, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019992560464887667, + "loss": 1.8994, + "step": 167 + }, + { + "epoch": 0.0525164113785558, + "grad_norm": 0.185546875, + "learning_rate": 0.00019992465403861524, + "loss": 1.7415, + "step": 168 + }, + { + "epoch": 0.052829009065332916, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019992369739577512, + "loss": 1.7688, + "step": 169 + }, + { + "epoch": 0.053141606752110035, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019992273472041404, + "loss": 1.7507, + "step": 170 + }, + { + "epoch": 
0.053454204438887154, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019992176601259015, + "loss": 1.995, + "step": 171 + }, + { + "epoch": 0.05376680212566427, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019992079127236192, + "loss": 1.9025, + "step": 172 + }, + { + "epoch": 0.054079399812441385, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001999198104997882, + "loss": 1.7634, + "step": 173 + }, + { + "epoch": 0.054391997499218504, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019991882369492815, + "loss": 1.8371, + "step": 174 + }, + { + "epoch": 0.05470459518599562, + "grad_norm": 0.17578125, + "learning_rate": 0.0001999178308578414, + "loss": 1.7978, + "step": 175 + }, + { + "epoch": 0.05501719287277274, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001999168319885879, + "loss": 2.0066, + "step": 176 + }, + { + "epoch": 0.05532979055954986, + "grad_norm": 0.17578125, + "learning_rate": 0.00019991582708722792, + "loss": 1.6957, + "step": 177 + }, + { + "epoch": 0.05564238824632698, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001999148161538221, + "loss": 1.8989, + "step": 178 + }, + { + "epoch": 0.05595498593310409, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019991379918843155, + "loss": 2.0687, + "step": 179 + }, + { + "epoch": 0.05626758361988121, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019991277619111763, + "loss": 1.9398, + "step": 180 + }, + { + "epoch": 0.05658018130665833, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019991174716194203, + "loss": 1.7309, + "step": 181 + }, + { + "epoch": 0.05689277899343545, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019991071210096698, + "loss": 1.8865, + "step": 182 + }, + { + "epoch": 0.05720537668021257, + "grad_norm": 0.173828125, + "learning_rate": 0.00019990967100825491, + "loss": 1.8802, + "step": 183 + }, + { + "epoch": 0.05751797436698968, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001999086238838687, + "loss": 1.814, + "step": 184 + }, + { + "epoch": 0.0578305720537668, + "grad_norm": 0.16796875, + "learning_rate": 0.00019990757072787152, + "loss": 1.6507, + "step": 185 + }, + { + "epoch": 0.05814316974054392, + "grad_norm": 0.17578125, + "learning_rate": 0.000199906511540327, + "loss": 1.921, + "step": 186 + }, + { + "epoch": 0.05845576742732104, + "grad_norm": 0.1923828125, + "learning_rate": 0.0001999054463212991, + "loss": 1.9151, + "step": 187 + }, + { + "epoch": 0.05876836511409816, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019990437507085202, + "loss": 2.0727, + "step": 188 + }, + { + "epoch": 0.05908096280087528, + "grad_norm": 0.17578125, + "learning_rate": 0.00019990329778905058, + "loss": 2.0359, + "step": 189 + }, + { + "epoch": 0.05939356048765239, + "grad_norm": 0.19921875, + "learning_rate": 0.00019990221447595968, + "loss": 1.9311, + "step": 190 + }, + { + "epoch": 0.05970615817442951, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019990112513164484, + "loss": 1.8018, + "step": 191 + }, + { + "epoch": 0.06001875586120663, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019990002975617174, + "loss": 1.9104, + "step": 192 + }, + { + "epoch": 0.060331353547983746, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019989892834960656, + "loss": 1.7227, + "step": 193 + }, + { + "epoch": 0.060643951234760865, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019989782091201573, + "loss": 1.7287, + "step": 194 + }, + { + "epoch": 0.06095654892153798, + "grad_norm": 0.181640625, + "learning_rate": 0.0001998967074434662, + 
"loss": 1.8525, + "step": 195 + }, + { + "epoch": 0.061269146608315096, + "grad_norm": 0.439453125, + "learning_rate": 0.00019989558794402515, + "loss": 2.4259, + "step": 196 + }, + { + "epoch": 0.061581744295092215, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001998944624137601, + "loss": 2.1134, + "step": 197 + }, + { + "epoch": 0.061894341981869334, + "grad_norm": 0.169921875, + "learning_rate": 0.0001998933308527391, + "loss": 1.9239, + "step": 198 + }, + { + "epoch": 0.06220693966864645, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001998921932610304, + "loss": 1.7292, + "step": 199 + }, + { + "epoch": 0.06251953735542357, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001998910496387027, + "loss": 1.7629, + "step": 200 + }, + { + "epoch": 0.06283213504220068, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019988989998582506, + "loss": 2.005, + "step": 201 + }, + { + "epoch": 0.0631447327289778, + "grad_norm": 0.173828125, + "learning_rate": 0.00019988874430246686, + "loss": 1.7605, + "step": 202 + }, + { + "epoch": 0.06345733041575492, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001998875825886979, + "loss": 1.748, + "step": 203 + }, + { + "epoch": 0.06376992810253204, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019988641484458826, + "loss": 2.1037, + "step": 204 + }, + { + "epoch": 0.06408252578930916, + "grad_norm": 0.181640625, + "learning_rate": 0.00019988524107020846, + "loss": 1.9274, + "step": 205 + }, + { + "epoch": 0.06439512347608628, + "grad_norm": 0.173828125, + "learning_rate": 0.00019988406126562937, + "loss": 1.7823, + "step": 206 + }, + { + "epoch": 0.0647077211628634, + "grad_norm": 0.1796875, + "learning_rate": 0.00019988287543092225, + "loss": 2.06, + "step": 207 + }, + { + "epoch": 0.06502031884964052, + "grad_norm": 0.193359375, + "learning_rate": 0.00019988168356615865, + "loss": 1.9327, + "step": 208 + }, + { + "epoch": 0.06533291653641764, + "grad_norm": 0.17578125, + "learning_rate": 0.00019988048567141052, + "loss": 1.9889, + "step": 209 + }, + { + "epoch": 0.06564551422319474, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019987928174675023, + "loss": 1.6262, + "step": 210 + }, + { + "epoch": 0.06595811190997186, + "grad_norm": 0.173828125, + "learning_rate": 0.00019987807179225035, + "loss": 1.8805, + "step": 211 + }, + { + "epoch": 0.06627070959674898, + "grad_norm": 0.181640625, + "learning_rate": 0.00019987685580798403, + "loss": 1.7265, + "step": 212 + }, + { + "epoch": 0.0665833072835261, + "grad_norm": 0.17578125, + "learning_rate": 0.0001998756337940247, + "loss": 1.7049, + "step": 213 + }, + { + "epoch": 0.06689590497030322, + "grad_norm": 0.173828125, + "learning_rate": 0.00019987440575044602, + "loss": 1.7256, + "step": 214 + }, + { + "epoch": 0.06720850265708034, + "grad_norm": 0.1748046875, + "learning_rate": 0.00019987317167732222, + "loss": 1.9469, + "step": 215 + }, + { + "epoch": 0.06752110034385746, + "grad_norm": 0.177734375, + "learning_rate": 0.00019987193157472777, + "loss": 2.0254, + "step": 216 + }, + { + "epoch": 0.06783369803063458, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019987068544273756, + "loss": 2.1006, + "step": 217 + }, + { + "epoch": 0.0681462957174117, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019986943328142678, + "loss": 1.9486, + "step": 218 + }, + { + "epoch": 0.06845889340418881, + "grad_norm": 0.181640625, + "learning_rate": 0.00019986817509087107, + "loss": 1.9707, + "step": 219 + }, + { + "epoch": 0.06877149109096593, + "grad_norm": 0.169921875, + 
"learning_rate": 0.00019986691087114635, + "loss": 1.868, + "step": 220 + }, + { + "epoch": 0.06908408877774304, + "grad_norm": 0.181640625, + "learning_rate": 0.00019986564062232897, + "loss": 1.9028, + "step": 221 + }, + { + "epoch": 0.06939668646452016, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001998643643444956, + "loss": 1.9136, + "step": 222 + }, + { + "epoch": 0.06970928415129728, + "grad_norm": 0.181640625, + "learning_rate": 0.0001998630820377233, + "loss": 1.8039, + "step": 223 + }, + { + "epoch": 0.0700218818380744, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019986179370208947, + "loss": 1.7326, + "step": 224 + }, + { + "epoch": 0.07033447952485151, + "grad_norm": 0.169921875, + "learning_rate": 0.0001998604993376719, + "loss": 1.7712, + "step": 225 + }, + { + "epoch": 0.07064707721162863, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019985919894454875, + "loss": 1.9061, + "step": 226 + }, + { + "epoch": 0.07095967489840575, + "grad_norm": 0.181640625, + "learning_rate": 0.00019985789252279846, + "loss": 1.8444, + "step": 227 + }, + { + "epoch": 0.07127227258518287, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001998565800725, + "loss": 2.1696, + "step": 228 + }, + { + "epoch": 0.07158487027195999, + "grad_norm": 0.19140625, + "learning_rate": 0.00019985526159373255, + "loss": 1.9888, + "step": 229 + }, + { + "epoch": 0.07189746795873711, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019985393708657568, + "loss": 2.018, + "step": 230 + }, + { + "epoch": 0.07221006564551423, + "grad_norm": 0.18359375, + "learning_rate": 0.0001998526065511094, + "loss": 1.7847, + "step": 231 + }, + { + "epoch": 0.07252266333229133, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019985126998741404, + "loss": 1.879, + "step": 232 + }, + { + "epoch": 0.07283526101906845, + "grad_norm": 0.177734375, + "learning_rate": 0.00019984992739557024, + "loss": 1.7065, + "step": 233 + }, + { + "epoch": 0.07314785870584557, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019984857877565907, + "loss": 1.7451, + "step": 234 + }, + { + "epoch": 0.07346045639262269, + "grad_norm": 0.173828125, + "learning_rate": 0.000199847224127762, + "loss": 1.8228, + "step": 235 + }, + { + "epoch": 0.07377305407939981, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019984586345196074, + "loss": 1.9904, + "step": 236 + }, + { + "epoch": 0.07408565176617693, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001998444967483375, + "loss": 1.8958, + "step": 237 + }, + { + "epoch": 0.07439824945295405, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019984312401697473, + "loss": 1.8913, + "step": 238 + }, + { + "epoch": 0.07471084713973117, + "grad_norm": 0.193359375, + "learning_rate": 0.00019984174525795536, + "loss": 1.9273, + "step": 239 + }, + { + "epoch": 0.07502344482650829, + "grad_norm": 0.189453125, + "learning_rate": 0.00019984036047136257, + "loss": 1.8831, + "step": 240 + }, + { + "epoch": 0.0753360425132854, + "grad_norm": 0.19140625, + "learning_rate": 0.00019983896965728001, + "loss": 1.9506, + "step": 241 + }, + { + "epoch": 0.07564864020006253, + "grad_norm": 0.173828125, + "learning_rate": 0.00019983757281579162, + "loss": 1.971, + "step": 242 + }, + { + "epoch": 0.07596123788683964, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019983616994698173, + "loss": 1.8156, + "step": 243 + }, + { + "epoch": 0.07627383557361675, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019983476105093505, + "loss": 1.9397, + "step": 244 + }, + { + "epoch": 
0.07658643326039387, + "grad_norm": 0.177734375, + "learning_rate": 0.00019983334612773662, + "loss": 1.7567, + "step": 245 + }, + { + "epoch": 0.07689903094717099, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019983192517747186, + "loss": 1.8685, + "step": 246 + }, + { + "epoch": 0.0772116286339481, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019983049820022656, + "loss": 2.2285, + "step": 247 + }, + { + "epoch": 0.07752422632072523, + "grad_norm": 0.193359375, + "learning_rate": 0.00019982906519608687, + "loss": 1.9532, + "step": 248 + }, + { + "epoch": 0.07783682400750234, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001998276261651393, + "loss": 1.8775, + "step": 249 + }, + { + "epoch": 0.07814942169427946, + "grad_norm": 0.18359375, + "learning_rate": 0.00019982618110747074, + "loss": 1.892, + "step": 250 + }, + { + "epoch": 0.07846201938105658, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019982473002316838, + "loss": 2.2827, + "step": 251 + }, + { + "epoch": 0.0787746170678337, + "grad_norm": 0.185546875, + "learning_rate": 0.0001998232729123199, + "loss": 2.1452, + "step": 252 + }, + { + "epoch": 0.07908721475461082, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019982180977501322, + "loss": 1.7888, + "step": 253 + }, + { + "epoch": 0.07939981244138794, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019982034061133666, + "loss": 1.7486, + "step": 254 + }, + { + "epoch": 0.07971241012816505, + "grad_norm": 0.1728515625, + "learning_rate": 0.00019981886542137892, + "loss": 1.8143, + "step": 255 + }, + { + "epoch": 0.08002500781494216, + "grad_norm": 0.18359375, + "learning_rate": 0.00019981738420522913, + "loss": 1.839, + "step": 256 + }, + { + "epoch": 0.08033760550171928, + "grad_norm": 0.169921875, + "learning_rate": 0.00019981589696297663, + "loss": 1.918, + "step": 257 + }, + { + "epoch": 0.0806502031884964, + "grad_norm": 0.19140625, + "learning_rate": 0.00019981440369471124, + "loss": 1.9144, + "step": 258 + }, + { + "epoch": 0.08096280087527352, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019981290440052306, + "loss": 1.7846, + "step": 259 + }, + { + "epoch": 0.08127539856205064, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001998113990805027, + "loss": 1.9837, + "step": 260 + }, + { + "epoch": 0.08158799624882776, + "grad_norm": 0.1875, + "learning_rate": 0.00019980988773474098, + "loss": 1.9422, + "step": 261 + }, + { + "epoch": 0.08190059393560488, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019980837036332917, + "loss": 1.7637, + "step": 262 + }, + { + "epoch": 0.082213191622382, + "grad_norm": 0.67578125, + "learning_rate": 0.0001998068469663588, + "loss": 2.5924, + "step": 263 + }, + { + "epoch": 0.08252578930915912, + "grad_norm": 0.185546875, + "learning_rate": 0.0001998053175439219, + "loss": 1.8041, + "step": 264 + }, + { + "epoch": 0.08283838699593624, + "grad_norm": 0.19921875, + "learning_rate": 0.00019980378209611083, + "loss": 2.139, + "step": 265 + }, + { + "epoch": 0.08315098468271334, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001998022406230182, + "loss": 1.8233, + "step": 266 + }, + { + "epoch": 0.08346358236949046, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001998006931247372, + "loss": 1.9227, + "step": 267 + }, + { + "epoch": 0.08377618005626758, + "grad_norm": 0.19140625, + "learning_rate": 0.00019979913960136114, + "loss": 1.7389, + "step": 268 + }, + { + "epoch": 0.0840887777430447, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019979758005298385, + "loss": 1.6342, + 
"step": 269 + }, + { + "epoch": 0.08440137542982182, + "grad_norm": 0.181640625, + "learning_rate": 0.0001997960144796995, + "loss": 1.9472, + "step": 270 + }, + { + "epoch": 0.08471397311659894, + "grad_norm": 0.18359375, + "learning_rate": 0.00019979444288160253, + "loss": 1.7985, + "step": 271 + }, + { + "epoch": 0.08502657080337606, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019979286525878792, + "loss": 1.8546, + "step": 272 + }, + { + "epoch": 0.08533916849015317, + "grad_norm": 0.19140625, + "learning_rate": 0.00019979128161135083, + "loss": 1.9697, + "step": 273 + }, + { + "epoch": 0.0856517661769303, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019978969193938694, + "loss": 2.095, + "step": 274 + }, + { + "epoch": 0.08596436386370741, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019978809624299218, + "loss": 1.9491, + "step": 275 + }, + { + "epoch": 0.08627696155048453, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019978649452226285, + "loss": 1.9463, + "step": 276 + }, + { + "epoch": 0.08658955923726164, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019978488677729574, + "loss": 1.8981, + "step": 277 + }, + { + "epoch": 0.08690215692403876, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019978327300818784, + "loss": 1.9126, + "step": 278 + }, + { + "epoch": 0.08721475461081588, + "grad_norm": 0.18359375, + "learning_rate": 0.0001997816532150366, + "loss": 1.8987, + "step": 279 + }, + { + "epoch": 0.087527352297593, + "grad_norm": 0.201171875, + "learning_rate": 0.00019978002739793978, + "loss": 1.7486, + "step": 280 + }, + { + "epoch": 0.08783994998437011, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019977839555699553, + "loss": 1.9603, + "step": 281 + }, + { + "epoch": 0.08815254767114723, + "grad_norm": 0.19140625, + "learning_rate": 0.00019977675769230246, + "loss": 1.8714, + "step": 282 + }, + { + "epoch": 0.08846514535792435, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019977511380395933, + "loss": 2.0087, + "step": 283 + }, + { + "epoch": 0.08877774304470147, + "grad_norm": 0.177734375, + "learning_rate": 0.00019977346389206545, + "loss": 2.1653, + "step": 284 + }, + { + "epoch": 0.08909034073147859, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019977180795672044, + "loss": 2.0311, + "step": 285 + }, + { + "epoch": 0.08940293841825571, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019977014599802418, + "loss": 1.8212, + "step": 286 + }, + { + "epoch": 0.08971553610503283, + "grad_norm": 0.193359375, + "learning_rate": 0.00019976847801607712, + "loss": 2.0245, + "step": 287 + }, + { + "epoch": 0.09002813379180995, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001997668040109799, + "loss": 1.8573, + "step": 288 + }, + { + "epoch": 0.09034073147858705, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019976512398283357, + "loss": 1.7208, + "step": 289 + }, + { + "epoch": 0.09065332916536417, + "grad_norm": 0.181640625, + "learning_rate": 0.00019976343793173958, + "loss": 1.7056, + "step": 290 + }, + { + "epoch": 0.09096592685214129, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019976174585779972, + "loss": 1.8874, + "step": 291 + }, + { + "epoch": 0.09127852453891841, + "grad_norm": 0.181640625, + "learning_rate": 0.00019976004776111613, + "loss": 1.5886, + "step": 292 + }, + { + "epoch": 0.09159112222569553, + "grad_norm": 0.181640625, + "learning_rate": 0.00019975834364179134, + "loss": 1.7725, + "step": 293 + }, + { + "epoch": 0.09190371991247265, + "grad_norm": 0.189453125, + 
"learning_rate": 0.0001997566334999282, + "loss": 1.7855, + "step": 294 + }, + { + "epoch": 0.09221631759924977, + "grad_norm": 0.1875, + "learning_rate": 0.00019975491733563, + "loss": 1.7919, + "step": 295 + }, + { + "epoch": 0.09252891528602689, + "grad_norm": 0.185546875, + "learning_rate": 0.00019975319514900028, + "loss": 1.7353, + "step": 296 + }, + { + "epoch": 0.092841512972804, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019975146694014312, + "loss": 1.8983, + "step": 297 + }, + { + "epoch": 0.09315411065958112, + "grad_norm": 0.185546875, + "learning_rate": 0.00019974973270916273, + "loss": 2.115, + "step": 298 + }, + { + "epoch": 0.09346670834635824, + "grad_norm": 0.177734375, + "learning_rate": 0.00019974799245616387, + "loss": 1.9605, + "step": 299 + }, + { + "epoch": 0.09377930603313535, + "grad_norm": 0.1953125, + "learning_rate": 0.0001997462461812516, + "loss": 1.9963, + "step": 300 + }, + { + "epoch": 0.09409190371991247, + "grad_norm": 0.189453125, + "learning_rate": 0.00019974449388453135, + "loss": 1.8288, + "step": 301 + }, + { + "epoch": 0.09440450140668959, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001997427355661089, + "loss": 1.7948, + "step": 302 + }, + { + "epoch": 0.0947170990934667, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001997409712260904, + "loss": 1.868, + "step": 303 + }, + { + "epoch": 0.09502969678024382, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019973920086458237, + "loss": 1.8929, + "step": 304 + }, + { + "epoch": 0.09534229446702094, + "grad_norm": 0.1796875, + "learning_rate": 0.00019973742448169165, + "loss": 1.6884, + "step": 305 + }, + { + "epoch": 0.09565489215379806, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019973564207752554, + "loss": 1.6901, + "step": 306 + }, + { + "epoch": 0.09596748984057518, + "grad_norm": 0.1875, + "learning_rate": 0.00019973385365219164, + "loss": 1.7943, + "step": 307 + }, + { + "epoch": 0.0962800875273523, + "grad_norm": 0.1875, + "learning_rate": 0.0001997320592057979, + "loss": 1.9581, + "step": 308 + }, + { + "epoch": 0.09659268521412942, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019973025873845263, + "loss": 1.6522, + "step": 309 + }, + { + "epoch": 0.09690528290090654, + "grad_norm": 0.189453125, + "learning_rate": 0.00019972845225026456, + "loss": 1.9327, + "step": 310 + }, + { + "epoch": 0.09721788058768364, + "grad_norm": 0.18359375, + "learning_rate": 0.00019972663974134275, + "loss": 1.9542, + "step": 311 + }, + { + "epoch": 0.09753047827446076, + "grad_norm": 0.189453125, + "learning_rate": 0.00019972482121179664, + "loss": 2.0571, + "step": 312 + }, + { + "epoch": 0.09784307596123788, + "grad_norm": 0.181640625, + "learning_rate": 0.00019972299666173594, + "loss": 2.2707, + "step": 313 + }, + { + "epoch": 0.098155673648015, + "grad_norm": 0.185546875, + "learning_rate": 0.0001997211660912709, + "loss": 1.9587, + "step": 314 + }, + { + "epoch": 0.09846827133479212, + "grad_norm": 0.189453125, + "learning_rate": 0.00019971932950051198, + "loss": 2.0126, + "step": 315 + }, + { + "epoch": 0.09878086902156924, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019971748688957003, + "loss": 1.7935, + "step": 316 + }, + { + "epoch": 0.09909346670834636, + "grad_norm": 0.18359375, + "learning_rate": 0.00019971563825855638, + "loss": 1.8761, + "step": 317 + }, + { + "epoch": 0.09940606439512348, + "grad_norm": 0.19921875, + "learning_rate": 0.00019971378360758254, + "loss": 2.2404, + "step": 318 + }, + { + "epoch": 0.0997186620819006, + "grad_norm": 
0.177734375, + "learning_rate": 0.0001997119229367605, + "loss": 1.8394, + "step": 319 + }, + { + "epoch": 0.10003125976867772, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019971005624620265, + "loss": 1.8923, + "step": 320 + }, + { + "epoch": 0.10034385745545484, + "grad_norm": 0.1953125, + "learning_rate": 0.00019970818353602163, + "loss": 1.6077, + "step": 321 + }, + { + "epoch": 0.10065645514223195, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019970630480633047, + "loss": 1.8617, + "step": 322 + }, + { + "epoch": 0.10096905282900906, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001997044200572427, + "loss": 1.892, + "step": 323 + }, + { + "epoch": 0.10128165051578618, + "grad_norm": 0.181640625, + "learning_rate": 0.000199702529288872, + "loss": 1.7457, + "step": 324 + }, + { + "epoch": 0.1015942482025633, + "grad_norm": 0.173828125, + "learning_rate": 0.00019970063250133256, + "loss": 1.9309, + "step": 325 + }, + { + "epoch": 0.10190684588934042, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019969872969473888, + "loss": 1.905, + "step": 326 + }, + { + "epoch": 0.10221944357611754, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019969682086920585, + "loss": 1.697, + "step": 327 + }, + { + "epoch": 0.10253204126289465, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001996949060248487, + "loss": 1.8728, + "step": 328 + }, + { + "epoch": 0.10284463894967177, + "grad_norm": 0.1796875, + "learning_rate": 0.00019969298516178303, + "loss": 1.7783, + "step": 329 + }, + { + "epoch": 0.10315723663644889, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001996910582801248, + "loss": 1.8591, + "step": 330 + }, + { + "epoch": 0.10346983432322601, + "grad_norm": 0.181640625, + "learning_rate": 0.00019968912537999034, + "loss": 1.8009, + "step": 331 + }, + { + "epoch": 0.10378243201000313, + "grad_norm": 0.177734375, + "learning_rate": 0.00019968718646149635, + "loss": 1.6679, + "step": 332 + }, + { + "epoch": 0.10409502969678025, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019968524152475986, + "loss": 1.9598, + "step": 333 + }, + { + "epoch": 0.10440762738355736, + "grad_norm": 0.185546875, + "learning_rate": 0.00019968329056989836, + "loss": 1.7525, + "step": 334 + }, + { + "epoch": 0.10472022507033447, + "grad_norm": 0.1875, + "learning_rate": 0.00019968133359702956, + "loss": 1.9891, + "step": 335 + }, + { + "epoch": 0.1050328227571116, + "grad_norm": 0.27734375, + "learning_rate": 0.00019967937060627163, + "loss": 2.6398, + "step": 336 + }, + { + "epoch": 0.10534542044388871, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019967740159774304, + "loss": 1.8126, + "step": 337 + }, + { + "epoch": 0.10565801813066583, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001996754265715627, + "loss": 1.5844, + "step": 338 + }, + { + "epoch": 0.10597061581744295, + "grad_norm": 0.443359375, + "learning_rate": 0.00019967344552784987, + "loss": 2.6948, + "step": 339 + }, + { + "epoch": 0.10628321350422007, + "grad_norm": 0.1796875, + "learning_rate": 0.00019967145846672412, + "loss": 1.8124, + "step": 340 + }, + { + "epoch": 0.10659581119099719, + "grad_norm": 0.17578125, + "learning_rate": 0.00019966946538830537, + "loss": 1.7512, + "step": 341 + }, + { + "epoch": 0.10690840887777431, + "grad_norm": 0.203125, + "learning_rate": 0.00019966746629271402, + "loss": 1.886, + "step": 342 + }, + { + "epoch": 0.10722100656455143, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001996654611800707, + "loss": 1.8067, + "step": 343 + }, + { + "epoch": 
0.10753360425132855, + "grad_norm": 0.185546875, + "learning_rate": 0.0001996634500504965, + "loss": 1.8013, + "step": 344 + }, + { + "epoch": 0.10784620193810565, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019966143290411282, + "loss": 1.701, + "step": 345 + }, + { + "epoch": 0.10815879962488277, + "grad_norm": 0.1953125, + "learning_rate": 0.00019965940974104145, + "loss": 1.6386, + "step": 346 + }, + { + "epoch": 0.10847139731165989, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001996573805614045, + "loss": 1.9652, + "step": 347 + }, + { + "epoch": 0.10878399499843701, + "grad_norm": 0.189453125, + "learning_rate": 0.0001996553453653245, + "loss": 1.8178, + "step": 348 + }, + { + "epoch": 0.10909659268521413, + "grad_norm": 0.177734375, + "learning_rate": 0.00019965330415292428, + "loss": 1.8802, + "step": 349 + }, + { + "epoch": 0.10940919037199125, + "grad_norm": 0.1767578125, + "learning_rate": 0.0001996512569243271, + "loss": 1.6879, + "step": 350 + }, + { + "epoch": 0.10972178805876837, + "grad_norm": 0.1923828125, + "learning_rate": 0.0001996492036796566, + "loss": 1.8288, + "step": 351 + }, + { + "epoch": 0.11003438574554548, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019964714441903663, + "loss": 1.8453, + "step": 352 + }, + { + "epoch": 0.1103469834323226, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019964507914259157, + "loss": 1.8259, + "step": 353 + }, + { + "epoch": 0.11065958111909972, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019964300785044615, + "loss": 1.9748, + "step": 354 + }, + { + "epoch": 0.11097217880587684, + "grad_norm": 0.18359375, + "learning_rate": 0.00019964093054272535, + "loss": 2.0296, + "step": 355 + }, + { + "epoch": 0.11128477649265396, + "grad_norm": 0.1806640625, + "learning_rate": 0.0001996388472195546, + "loss": 2.1065, + "step": 356 + }, + { + "epoch": 0.11159737417943107, + "grad_norm": 0.19140625, + "learning_rate": 0.00019963675788105967, + "loss": 1.712, + "step": 357 + }, + { + "epoch": 0.11190997186620819, + "grad_norm": 0.173828125, + "learning_rate": 0.0001996346625273667, + "loss": 2.178, + "step": 358 + }, + { + "epoch": 0.1122225695529853, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019963256115860219, + "loss": 1.6854, + "step": 359 + }, + { + "epoch": 0.11253516723976242, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019963045377489297, + "loss": 1.7912, + "step": 360 + }, + { + "epoch": 0.11284776492653954, + "grad_norm": 0.181640625, + "learning_rate": 0.00019962834037636634, + "loss": 1.7385, + "step": 361 + }, + { + "epoch": 0.11316036261331666, + "grad_norm": 0.19140625, + "learning_rate": 0.00019962622096314983, + "loss": 1.787, + "step": 362 + }, + { + "epoch": 0.11347296030009378, + "grad_norm": 0.185546875, + "learning_rate": 0.00019962409553537141, + "loss": 1.7083, + "step": 363 + }, + { + "epoch": 0.1137855579868709, + "grad_norm": 0.177734375, + "learning_rate": 0.00019962196409315937, + "loss": 1.7489, + "step": 364 + }, + { + "epoch": 0.11409815567364802, + "grad_norm": 0.1875, + "learning_rate": 0.00019961982663664244, + "loss": 1.8184, + "step": 365 + }, + { + "epoch": 0.11441075336042514, + "grad_norm": 0.181640625, + "learning_rate": 0.0001996176831659496, + "loss": 1.924, + "step": 366 + }, + { + "epoch": 0.11472335104720226, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001996155336812103, + "loss": 2.1837, + "step": 367 + }, + { + "epoch": 0.11503594873397936, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019961337818255424, + "loss": 1.9305, + 
"step": 368 + }, + { + "epoch": 0.11534854642075648, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019961121667011166, + "loss": 1.9867, + "step": 369 + }, + { + "epoch": 0.1156611441075336, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019960904914401298, + "loss": 1.968, + "step": 370 + }, + { + "epoch": 0.11597374179431072, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019960687560438908, + "loss": 1.6922, + "step": 371 + }, + { + "epoch": 0.11628633948108784, + "grad_norm": 0.169921875, + "learning_rate": 0.00019960469605137114, + "loss": 1.7978, + "step": 372 + }, + { + "epoch": 0.11659893716786496, + "grad_norm": 0.189453125, + "learning_rate": 0.0001996025104850908, + "loss": 1.8674, + "step": 373 + }, + { + "epoch": 0.11691153485464208, + "grad_norm": 0.1796875, + "learning_rate": 0.00019960031890567997, + "loss": 1.7445, + "step": 374 + }, + { + "epoch": 0.1172241325414192, + "grad_norm": 0.185546875, + "learning_rate": 0.00019959812131327095, + "loss": 1.7513, + "step": 375 + }, + { + "epoch": 0.11753673022819631, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019959591770799643, + "loss": 1.7463, + "step": 376 + }, + { + "epoch": 0.11784932791497343, + "grad_norm": 0.189453125, + "learning_rate": 0.00019959370808998945, + "loss": 1.6496, + "step": 377 + }, + { + "epoch": 0.11816192560175055, + "grad_norm": 0.18359375, + "learning_rate": 0.0001995914924593834, + "loss": 1.6407, + "step": 378 + }, + { + "epoch": 0.11847452328852766, + "grad_norm": 0.19140625, + "learning_rate": 0.00019958927081631205, + "loss": 1.9992, + "step": 379 + }, + { + "epoch": 0.11878712097530478, + "grad_norm": 0.1875, + "learning_rate": 0.0001995870431609095, + "loss": 1.7538, + "step": 380 + }, + { + "epoch": 0.1190997186620819, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019958480949331024, + "loss": 1.6851, + "step": 381 + }, + { + "epoch": 0.11941231634885902, + "grad_norm": 0.189453125, + "learning_rate": 0.00019958256981364916, + "loss": 1.7887, + "step": 382 + }, + { + "epoch": 0.11972491403563613, + "grad_norm": 0.181640625, + "learning_rate": 0.00019958032412206142, + "loss": 1.8162, + "step": 383 + }, + { + "epoch": 0.12003751172241325, + "grad_norm": 0.1875, + "learning_rate": 0.0001995780724186826, + "loss": 1.8541, + "step": 384 + }, + { + "epoch": 0.12035010940919037, + "grad_norm": 0.1875, + "learning_rate": 0.00019957581470364869, + "loss": 1.8194, + "step": 385 + }, + { + "epoch": 0.12066270709596749, + "grad_norm": 0.20703125, + "learning_rate": 0.0001995735509770959, + "loss": 1.7891, + "step": 386 + }, + { + "epoch": 0.12097530478274461, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019957128123916103, + "loss": 1.992, + "step": 387 + }, + { + "epoch": 0.12128790246952173, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019956900548998097, + "loss": 1.9259, + "step": 388 + }, + { + "epoch": 0.12160050015629885, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019956672372969315, + "loss": 2.0642, + "step": 389 + }, + { + "epoch": 0.12191309784307595, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001995644359584354, + "loss": 1.6211, + "step": 390 + }, + { + "epoch": 0.12222569552985307, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019956214217634575, + "loss": 1.7604, + "step": 391 + }, + { + "epoch": 0.12253829321663019, + "grad_norm": 0.177734375, + "learning_rate": 0.00019955984238356268, + "loss": 1.8761, + "step": 392 + }, + { + "epoch": 0.12285089090340731, + "grad_norm": 0.193359375, + "learning_rate": 
0.0001995575365802251, + "loss": 2.0069, + "step": 393 + }, + { + "epoch": 0.12316348859018443, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001995552247664721, + "loss": 1.7372, + "step": 394 + }, + { + "epoch": 0.12347608627696155, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019955290694244338, + "loss": 1.8025, + "step": 395 + }, + { + "epoch": 0.12378868396373867, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019955058310827878, + "loss": 1.8633, + "step": 396 + }, + { + "epoch": 0.12410128165051579, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019954825326411863, + "loss": 1.9765, + "step": 397 + }, + { + "epoch": 0.1244138793372929, + "grad_norm": 0.197265625, + "learning_rate": 0.0001995459174101036, + "loss": 1.6959, + "step": 398 + }, + { + "epoch": 0.12472647702407003, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001995435755463746, + "loss": 1.6401, + "step": 399 + }, + { + "epoch": 0.12503907471084713, + "grad_norm": 0.185546875, + "learning_rate": 0.00019954122767307318, + "loss": 2.1424, + "step": 400 + }, + { + "epoch": 0.12535167239762426, + "grad_norm": 0.17578125, + "learning_rate": 0.00019953887379034094, + "loss": 1.9393, + "step": 401 + }, + { + "epoch": 0.12566427008440137, + "grad_norm": 0.193359375, + "learning_rate": 0.00019953651389832008, + "loss": 1.8414, + "step": 402 + }, + { + "epoch": 0.1259768677711785, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019953414799715304, + "loss": 1.9348, + "step": 403 + }, + { + "epoch": 0.1262894654579556, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019953177608698263, + "loss": 1.6774, + "step": 404 + }, + { + "epoch": 0.12660206314473274, + "grad_norm": 0.18359375, + "learning_rate": 0.00019952939816795205, + "loss": 1.9635, + "step": 405 + }, + { + "epoch": 0.12691466083150985, + "grad_norm": 0.189453125, + "learning_rate": 0.0001995270142402049, + "loss": 1.788, + "step": 406 + }, + { + "epoch": 0.12722725851828695, + "grad_norm": 0.177734375, + "learning_rate": 0.00019952462430388506, + "loss": 1.7256, + "step": 407 + }, + { + "epoch": 0.12753985620506408, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019952222835913682, + "loss": 1.8476, + "step": 408 + }, + { + "epoch": 0.1278524538918412, + "grad_norm": 0.19140625, + "learning_rate": 0.00019951982640610484, + "loss": 1.9212, + "step": 409 + }, + { + "epoch": 0.12816505157861832, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019951741844493413, + "loss": 1.807, + "step": 410 + }, + { + "epoch": 0.12847764926539543, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019951500447577003, + "loss": 1.6015, + "step": 411 + }, + { + "epoch": 0.12879024695217256, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019951258449875828, + "loss": 1.8802, + "step": 412 + }, + { + "epoch": 0.12910284463894967, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019951015851404504, + "loss": 1.9614, + "step": 413 + }, + { + "epoch": 0.1294154423257268, + "grad_norm": 0.197265625, + "learning_rate": 0.0001995077265217767, + "loss": 1.8907, + "step": 414 + }, + { + "epoch": 0.1297280400125039, + "grad_norm": 0.197265625, + "learning_rate": 0.00019950528852210014, + "loss": 1.8123, + "step": 415 + }, + { + "epoch": 0.13004063769928104, + "grad_norm": 0.18359375, + "learning_rate": 0.00019950284451516245, + "loss": 1.6966, + "step": 416 + }, + { + "epoch": 0.13035323538605814, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019950039450111127, + "loss": 2.0439, + "step": 417 + }, + { + "epoch": 0.13066583307283527, + 
"grad_norm": 0.185546875, + "learning_rate": 0.00019949793848009448, + "loss": 1.9781, + "step": 418 + }, + { + "epoch": 0.13097843075961238, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019949547645226035, + "loss": 1.9264, + "step": 419 + }, + { + "epoch": 0.13129102844638948, + "grad_norm": 0.197265625, + "learning_rate": 0.00019949300841775753, + "loss": 2.0297, + "step": 420 + }, + { + "epoch": 0.13160362613316662, + "grad_norm": 0.19140625, + "learning_rate": 0.000199490534376735, + "loss": 1.9136, + "step": 421 + }, + { + "epoch": 0.13191622381994372, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019948805432934213, + "loss": 1.8224, + "step": 422 + }, + { + "epoch": 0.13222882150672086, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019948556827572862, + "loss": 1.7871, + "step": 423 + }, + { + "epoch": 0.13254141919349796, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019948307621604457, + "loss": 1.7048, + "step": 424 + }, + { + "epoch": 0.1328540168802751, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019948057815044048, + "loss": 1.9041, + "step": 425 + }, + { + "epoch": 0.1331666145670522, + "grad_norm": 0.1796875, + "learning_rate": 0.0001994780740790671, + "loss": 1.7443, + "step": 426 + }, + { + "epoch": 0.13347921225382933, + "grad_norm": 0.189453125, + "learning_rate": 0.0001994755640020756, + "loss": 1.6474, + "step": 427 + }, + { + "epoch": 0.13379180994060644, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019947304791961758, + "loss": 1.8303, + "step": 428 + }, + { + "epoch": 0.13410440762738357, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019947052583184488, + "loss": 1.64, + "step": 429 + }, + { + "epoch": 0.13441700531416068, + "grad_norm": 0.189453125, + "learning_rate": 0.00019946799773890974, + "loss": 1.7586, + "step": 430 + }, + { + "epoch": 0.13472960300093778, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019946546364096488, + "loss": 1.8402, + "step": 431 + }, + { + "epoch": 0.13504220068771491, + "grad_norm": 0.64453125, + "learning_rate": 0.00019946292353816318, + "loss": 2.2409, + "step": 432 + }, + { + "epoch": 0.13535479837449202, + "grad_norm": 0.193359375, + "learning_rate": 0.0001994603774306581, + "loss": 1.8416, + "step": 433 + }, + { + "epoch": 0.13566739606126915, + "grad_norm": 0.181640625, + "learning_rate": 0.00019945782531860325, + "loss": 1.7372, + "step": 434 + }, + { + "epoch": 0.13597999374804626, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019945526720215273, + "loss": 1.9704, + "step": 435 + }, + { + "epoch": 0.1362925914348234, + "grad_norm": 0.185546875, + "learning_rate": 0.00019945270308146103, + "loss": 1.6651, + "step": 436 + }, + { + "epoch": 0.1366051891216005, + "grad_norm": 0.19921875, + "learning_rate": 0.00019945013295668288, + "loss": 1.7958, + "step": 437 + }, + { + "epoch": 0.13691778680837763, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001994475568279735, + "loss": 2.0826, + "step": 438 + }, + { + "epoch": 0.13723038449515473, + "grad_norm": 0.19140625, + "learning_rate": 0.00019944497469548837, + "loss": 1.8808, + "step": 439 + }, + { + "epoch": 0.13754298218193187, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019944238655938339, + "loss": 2.257, + "step": 440 + }, + { + "epoch": 0.13785557986870897, + "grad_norm": 0.1796875, + "learning_rate": 0.0001994397924198148, + "loss": 2.0791, + "step": 441 + }, + { + "epoch": 0.13816817755548608, + "grad_norm": 0.193359375, + "learning_rate": 0.00019943719227693928, + "loss": 1.8917, + "step": 442 + }, + { 
+ "epoch": 0.1384807752422632, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001994345861309137, + "loss": 1.8261, + "step": 443 + }, + { + "epoch": 0.13879337292904032, + "grad_norm": 0.189453125, + "learning_rate": 0.00019943197398189546, + "loss": 1.626, + "step": 444 + }, + { + "epoch": 0.13910597061581745, + "grad_norm": 0.193359375, + "learning_rate": 0.00019942935583004223, + "loss": 1.7819, + "step": 445 + }, + { + "epoch": 0.13941856830259455, + "grad_norm": 0.19921875, + "learning_rate": 0.0001994267316755121, + "loss": 1.8149, + "step": 446 + }, + { + "epoch": 0.1397311659893717, + "grad_norm": 0.1796875, + "learning_rate": 0.00019942410151846347, + "loss": 1.9703, + "step": 447 + }, + { + "epoch": 0.1400437636761488, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019942146535905514, + "loss": 1.7519, + "step": 448 + }, + { + "epoch": 0.14035636136292592, + "grad_norm": 0.201171875, + "learning_rate": 0.00019941882319744625, + "loss": 1.8088, + "step": 449 + }, + { + "epoch": 0.14066895904970303, + "grad_norm": 0.1953125, + "learning_rate": 0.0001994161750337963, + "loss": 2.0352, + "step": 450 + }, + { + "epoch": 0.14098155673648016, + "grad_norm": 0.19921875, + "learning_rate": 0.0001994135208682652, + "loss": 1.7832, + "step": 451 + }, + { + "epoch": 0.14129415442325727, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019941086070101314, + "loss": 1.7351, + "step": 452 + }, + { + "epoch": 0.14160675211003437, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019940819453220074, + "loss": 1.9127, + "step": 453 + }, + { + "epoch": 0.1419193497968115, + "grad_norm": 0.478515625, + "learning_rate": 0.00019940552236198897, + "loss": 2.6953, + "step": 454 + }, + { + "epoch": 0.1422319474835886, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019940284419053914, + "loss": 2.0053, + "step": 455 + }, + { + "epoch": 0.14254454517036574, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019940016001801294, + "loss": 1.7283, + "step": 456 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.1923828125, + "learning_rate": 0.0001993974698445724, + "loss": 1.7655, + "step": 457 + }, + { + "epoch": 0.14316974054391998, + "grad_norm": 0.19921875, + "learning_rate": 0.00019939477367037994, + "loss": 1.8373, + "step": 458 + }, + { + "epoch": 0.1434823382306971, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019939207149559835, + "loss": 1.8626, + "step": 459 + }, + { + "epoch": 0.14379493591747422, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019938936332039077, + "loss": 1.6125, + "step": 460 + }, + { + "epoch": 0.14410753360425133, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019938664914492062, + "loss": 2.0307, + "step": 461 + }, + { + "epoch": 0.14442013129102846, + "grad_norm": 0.193359375, + "learning_rate": 0.00019938392896935183, + "loss": 1.84, + "step": 462 + }, + { + "epoch": 0.14473272897780556, + "grad_norm": 0.19921875, + "learning_rate": 0.0001993812027938486, + "loss": 1.9634, + "step": 463 + }, + { + "epoch": 0.14504532666458267, + "grad_norm": 0.1953125, + "learning_rate": 0.00019937847061857552, + "loss": 2.0152, + "step": 464 + }, + { + "epoch": 0.1453579243513598, + "grad_norm": 0.201171875, + "learning_rate": 0.00019937573244369753, + "loss": 1.8692, + "step": 465 + }, + { + "epoch": 0.1456705220381369, + "grad_norm": 0.19140625, + "learning_rate": 0.00019937298826937995, + "loss": 1.7805, + "step": 466 + }, + { + "epoch": 0.14598311972491404, + "grad_norm": 0.197265625, + "learning_rate": 0.00019937023809578843, + "loss": 
1.9569, + "step": 467 + }, + { + "epoch": 0.14629571741169115, + "grad_norm": 0.1865234375, + "learning_rate": 0.000199367481923089, + "loss": 1.9791, + "step": 468 + }, + { + "epoch": 0.14660831509846828, + "grad_norm": 0.189453125, + "learning_rate": 0.00019936471975144805, + "loss": 1.7193, + "step": 469 + }, + { + "epoch": 0.14692091278524538, + "grad_norm": 0.19140625, + "learning_rate": 0.00019936195158103237, + "loss": 1.7506, + "step": 470 + }, + { + "epoch": 0.14723351047202252, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019935917741200902, + "loss": 1.9867, + "step": 471 + }, + { + "epoch": 0.14754610815879962, + "grad_norm": 0.1953125, + "learning_rate": 0.00019935639724454556, + "loss": 1.8894, + "step": 472 + }, + { + "epoch": 0.14785870584557675, + "grad_norm": 0.197265625, + "learning_rate": 0.00019935361107880977, + "loss": 1.7917, + "step": 473 + }, + { + "epoch": 0.14817130353235386, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019935081891496985, + "loss": 1.9643, + "step": 474 + }, + { + "epoch": 0.14848390121913096, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001993480207531944, + "loss": 1.6624, + "step": 475 + }, + { + "epoch": 0.1487964989059081, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019934521659365235, + "loss": 1.5768, + "step": 476 + }, + { + "epoch": 0.1491090965926852, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019934240643651298, + "loss": 1.8556, + "step": 477 + }, + { + "epoch": 0.14942169427946234, + "grad_norm": 0.189453125, + "learning_rate": 0.00019933959028194592, + "loss": 1.9329, + "step": 478 + }, + { + "epoch": 0.14973429196623944, + "grad_norm": 0.203125, + "learning_rate": 0.0001993367681301212, + "loss": 1.7054, + "step": 479 + }, + { + "epoch": 0.15004688965301657, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001993339399812092, + "loss": 1.8809, + "step": 480 + }, + { + "epoch": 0.15035948733979368, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001993311058353807, + "loss": 1.5983, + "step": 481 + }, + { + "epoch": 0.1506720850265708, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019932826569280673, + "loss": 1.7169, + "step": 482 + }, + { + "epoch": 0.15098468271334792, + "grad_norm": 0.1953125, + "learning_rate": 0.00019932541955365883, + "loss": 1.9345, + "step": 483 + }, + { + "epoch": 0.15129728040012505, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019932256741810874, + "loss": 2.1597, + "step": 484 + }, + { + "epoch": 0.15160987808690216, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001993197092863287, + "loss": 1.5661, + "step": 485 + }, + { + "epoch": 0.1519224757736793, + "grad_norm": 0.19140625, + "learning_rate": 0.0001993168451584912, + "loss": 1.8121, + "step": 486 + }, + { + "epoch": 0.1522350734604564, + "grad_norm": 0.18359375, + "learning_rate": 0.00019931397503476924, + "loss": 1.7365, + "step": 487 + }, + { + "epoch": 0.1525476711472335, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019931109891533605, + "loss": 1.6982, + "step": 488 + }, + { + "epoch": 0.15286026883401063, + "grad_norm": 0.189453125, + "learning_rate": 0.00019930821680036527, + "loss": 1.9638, + "step": 489 + }, + { + "epoch": 0.15317286652078774, + "grad_norm": 0.201171875, + "learning_rate": 0.00019930532869003086, + "loss": 2.1991, + "step": 490 + }, + { + "epoch": 0.15348546420756487, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019930243458450724, + "loss": 1.8095, + "step": 491 + }, + { + "epoch": 0.15379806189434198, + "grad_norm": 0.177734375, + "learning_rate": 
0.0001992995344839691, + "loss": 1.9021, + "step": 492 + }, + { + "epoch": 0.1541106595811191, + "grad_norm": 0.19921875, + "learning_rate": 0.0001992966283885915, + "loss": 1.9448, + "step": 493 + }, + { + "epoch": 0.1544232572678962, + "grad_norm": 0.19921875, + "learning_rate": 0.00019929371629854992, + "loss": 1.9806, + "step": 494 + }, + { + "epoch": 0.15473585495467335, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001992907982140202, + "loss": 1.7495, + "step": 495 + }, + { + "epoch": 0.15504845264145045, + "grad_norm": 0.203125, + "learning_rate": 0.00019928787413517842, + "loss": 2.0022, + "step": 496 + }, + { + "epoch": 0.15536105032822758, + "grad_norm": 0.193359375, + "learning_rate": 0.00019928494406220115, + "loss": 1.7185, + "step": 497 + }, + { + "epoch": 0.1556736480150047, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019928200799526532, + "loss": 2.0288, + "step": 498 + }, + { + "epoch": 0.1559862457017818, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019927906593454812, + "loss": 1.7969, + "step": 499 + }, + { + "epoch": 0.15629884338855893, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001992761178802272, + "loss": 2.1816, + "step": 500 + }, + { + "epoch": 0.15661144107533603, + "grad_norm": 0.1953125, + "learning_rate": 0.00019927316383248054, + "loss": 1.8524, + "step": 501 + }, + { + "epoch": 0.15692403876211317, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019927020379148646, + "loss": 1.6543, + "step": 502 + }, + { + "epoch": 0.15723663644889027, + "grad_norm": 0.203125, + "learning_rate": 0.0001992672377574237, + "loss": 1.7662, + "step": 503 + }, + { + "epoch": 0.1575492341356674, + "grad_norm": 0.1953125, + "learning_rate": 0.0001992642657304713, + "loss": 1.8305, + "step": 504 + }, + { + "epoch": 0.1578618318224445, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019926128771080868, + "loss": 1.6887, + "step": 505 + }, + { + "epoch": 0.15817442950922164, + "grad_norm": 0.1953125, + "learning_rate": 0.00019925830369861564, + "loss": 1.9668, + "step": 506 + }, + { + "epoch": 0.15848702719599875, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019925531369407228, + "loss": 1.8739, + "step": 507 + }, + { + "epoch": 0.15879962488277588, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019925231769735917, + "loss": 1.8289, + "step": 508 + }, + { + "epoch": 0.15911222256955299, + "grad_norm": 0.185546875, + "learning_rate": 0.0001992493157086572, + "loss": 1.9057, + "step": 509 + }, + { + "epoch": 0.1594248202563301, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019924630772814753, + "loss": 1.8643, + "step": 510 + }, + { + "epoch": 0.15973741794310722, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019924329375601177, + "loss": 1.8911, + "step": 511 + }, + { + "epoch": 0.16005001562988433, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019924027379243192, + "loss": 1.6922, + "step": 512 + }, + { + "epoch": 0.16036261331666146, + "grad_norm": 0.1923828125, + "learning_rate": 0.0001992372478375903, + "loss": 1.9621, + "step": 513 + }, + { + "epoch": 0.16067521100343857, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019923421589166954, + "loss": 1.8731, + "step": 514 + }, + { + "epoch": 0.1609878086902157, + "grad_norm": 0.201171875, + "learning_rate": 0.00019923117795485272, + "loss": 1.6659, + "step": 515 + }, + { + "epoch": 0.1613004063769928, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019922813402732325, + "loss": 1.9896, + "step": 516 + }, + { + "epoch": 0.16161300406376994, + "grad_norm": 
0.1982421875, + "learning_rate": 0.00019922508410926489, + "loss": 1.8087, + "step": 517 + }, + { + "epoch": 0.16192560175054704, + "grad_norm": 0.19921875, + "learning_rate": 0.00019922202820086171, + "loss": 2.0338, + "step": 518 + }, + { + "epoch": 0.16223819943732418, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019921896630229827, + "loss": 1.8984, + "step": 519 + }, + { + "epoch": 0.16255079712410128, + "grad_norm": 0.205078125, + "learning_rate": 0.0001992158984137594, + "loss": 1.7892, + "step": 520 + }, + { + "epoch": 0.1628633948108784, + "grad_norm": 0.19921875, + "learning_rate": 0.00019921282453543032, + "loss": 1.6763, + "step": 521 + }, + { + "epoch": 0.16317599249765552, + "grad_norm": 0.185546875, + "learning_rate": 0.0001992097446674966, + "loss": 1.8474, + "step": 522 + }, + { + "epoch": 0.16348859018443263, + "grad_norm": 0.193359375, + "learning_rate": 0.00019920665881014416, + "loss": 1.9876, + "step": 523 + }, + { + "epoch": 0.16380118787120976, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001992035669635593, + "loss": 1.7454, + "step": 524 + }, + { + "epoch": 0.16411378555798686, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001992004691279287, + "loss": 1.9164, + "step": 525 + }, + { + "epoch": 0.164426383244764, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019919736530343935, + "loss": 1.9096, + "step": 526 + }, + { + "epoch": 0.1647389809315411, + "grad_norm": 0.1953125, + "learning_rate": 0.00019919425549027865, + "loss": 1.9148, + "step": 527 + }, + { + "epoch": 0.16505157861831823, + "grad_norm": 0.1953125, + "learning_rate": 0.00019919113968863437, + "loss": 1.9967, + "step": 528 + }, + { + "epoch": 0.16536417630509534, + "grad_norm": 0.2109375, + "learning_rate": 0.00019918801789869453, + "loss": 1.9329, + "step": 529 + }, + { + "epoch": 0.16567677399187247, + "grad_norm": 0.19921875, + "learning_rate": 0.00019918489012064772, + "loss": 1.9399, + "step": 530 + }, + { + "epoch": 0.16598937167864958, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019918175635468265, + "loss": 1.9082, + "step": 531 + }, + { + "epoch": 0.16630196936542668, + "grad_norm": 0.193359375, + "learning_rate": 0.00019917861660098858, + "loss": 1.9138, + "step": 532 + }, + { + "epoch": 0.16661456705220382, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019917547085975505, + "loss": 1.7534, + "step": 533 + }, + { + "epoch": 0.16692716473898092, + "grad_norm": 0.181640625, + "learning_rate": 0.00019917231913117197, + "loss": 1.8574, + "step": 534 + }, + { + "epoch": 0.16723976242575805, + "grad_norm": 0.19921875, + "learning_rate": 0.0001991691614154296, + "loss": 1.7967, + "step": 535 + }, + { + "epoch": 0.16755236011253516, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019916599771271855, + "loss": 1.765, + "step": 536 + }, + { + "epoch": 0.1678649577993123, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019916282802322989, + "loss": 1.9999, + "step": 537 + }, + { + "epoch": 0.1681775554860894, + "grad_norm": 0.197265625, + "learning_rate": 0.00019915965234715491, + "loss": 1.9353, + "step": 538 + }, + { + "epoch": 0.16849015317286653, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019915647068468538, + "loss": 1.8003, + "step": 539 + }, + { + "epoch": 0.16880275085964364, + "grad_norm": 0.19921875, + "learning_rate": 0.00019915328303601334, + "loss": 2.1542, + "step": 540 + }, + { + "epoch": 0.16911534854642077, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019915008940133127, + "loss": 1.9446, + "step": 541 + }, + { + "epoch": 
0.16942794623319787, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019914688978083192, + "loss": 2.0184, + "step": 542 + }, + { + "epoch": 0.16974054391997498, + "grad_norm": 0.1875, + "learning_rate": 0.00019914368417470852, + "loss": 1.8707, + "step": 543 + }, + { + "epoch": 0.1700531416067521, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019914047258315457, + "loss": 1.8503, + "step": 544 + }, + { + "epoch": 0.17036573929352922, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019913725500636393, + "loss": 1.9382, + "step": 545 + }, + { + "epoch": 0.17067833698030635, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019913403144453088, + "loss": 1.6436, + "step": 546 + }, + { + "epoch": 0.17099093466708346, + "grad_norm": 0.197265625, + "learning_rate": 0.00019913080189785002, + "loss": 2.0155, + "step": 547 + }, + { + "epoch": 0.1713035323538606, + "grad_norm": 0.1875, + "learning_rate": 0.00019912756636651638, + "loss": 1.9679, + "step": 548 + }, + { + "epoch": 0.1716161300406377, + "grad_norm": 0.197265625, + "learning_rate": 0.00019912432485072516, + "loss": 1.619, + "step": 549 + }, + { + "epoch": 0.17192872772741483, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001991210773506722, + "loss": 1.8251, + "step": 550 + }, + { + "epoch": 0.17224132541419193, + "grad_norm": 0.197265625, + "learning_rate": 0.00019911782386655341, + "loss": 1.9356, + "step": 551 + }, + { + "epoch": 0.17255392310096906, + "grad_norm": 0.193359375, + "learning_rate": 0.00019911456439856536, + "loss": 1.7967, + "step": 552 + }, + { + "epoch": 0.17286652078774617, + "grad_norm": 0.1953125, + "learning_rate": 0.00019911129894690475, + "loss": 1.7887, + "step": 553 + }, + { + "epoch": 0.17317911847452327, + "grad_norm": 0.201171875, + "learning_rate": 0.00019910802751176867, + "loss": 1.8225, + "step": 554 + }, + { + "epoch": 0.1734917161613004, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019910475009335472, + "loss": 1.7761, + "step": 555 + }, + { + "epoch": 0.1738043138480775, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001991014666918607, + "loss": 1.917, + "step": 556 + }, + { + "epoch": 0.17411691153485465, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019909817730748487, + "loss": 1.707, + "step": 557 + }, + { + "epoch": 0.17442950922163175, + "grad_norm": 0.1953125, + "learning_rate": 0.00019909488194042575, + "loss": 2.2473, + "step": 558 + }, + { + "epoch": 0.17474210690840888, + "grad_norm": 0.1953125, + "learning_rate": 0.00019909158059088235, + "loss": 1.5952, + "step": 559 + }, + { + "epoch": 0.175054704595186, + "grad_norm": 0.1923828125, + "learning_rate": 0.000199088273259054, + "loss": 1.6575, + "step": 560 + }, + { + "epoch": 0.17536730228196312, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019908495994514026, + "loss": 1.9749, + "step": 561 + }, + { + "epoch": 0.17567989996874023, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019908164064934126, + "loss": 1.681, + "step": 562 + }, + { + "epoch": 0.17599249765551736, + "grad_norm": 0.1806640625, + "learning_rate": 0.00019907831537185734, + "loss": 1.7532, + "step": 563 + }, + { + "epoch": 0.17630509534229447, + "grad_norm": 0.19140625, + "learning_rate": 0.00019907498411288925, + "loss": 2.0639, + "step": 564 + }, + { + "epoch": 0.1766176930290716, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019907164687263813, + "loss": 2.1285, + "step": 565 + }, + { + "epoch": 0.1769302907158487, + "grad_norm": 0.189453125, + "learning_rate": 0.00019906830365130546, + "loss": 1.7988, + "step": 
566 + }, + { + "epoch": 0.1772428884026258, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019906495444909302, + "loss": 1.6593, + "step": 567 + }, + { + "epoch": 0.17755548608940294, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019906159926620306, + "loss": 1.8094, + "step": 568 + }, + { + "epoch": 0.17786808377618005, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019905823810283812, + "loss": 1.6249, + "step": 569 + }, + { + "epoch": 0.17818068146295718, + "grad_norm": 0.185546875, + "learning_rate": 0.0001990548709592011, + "loss": 1.6268, + "step": 570 + }, + { + "epoch": 0.17849327914973429, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019905149783549532, + "loss": 1.5067, + "step": 571 + }, + { + "epoch": 0.17880587683651142, + "grad_norm": 0.19140625, + "learning_rate": 0.00019904811873192437, + "loss": 1.7792, + "step": 572 + }, + { + "epoch": 0.17911847452328852, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001990447336486923, + "loss": 1.7893, + "step": 573 + }, + { + "epoch": 0.17943107221006566, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001990413425860034, + "loss": 1.7304, + "step": 574 + }, + { + "epoch": 0.17974366989684276, + "grad_norm": 0.193359375, + "learning_rate": 0.00019903794554406248, + "loss": 1.9092, + "step": 575 + }, + { + "epoch": 0.1800562675836199, + "grad_norm": 0.193359375, + "learning_rate": 0.00019903454252307454, + "loss": 1.6916, + "step": 576 + }, + { + "epoch": 0.180368865270397, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001990311335232451, + "loss": 1.7437, + "step": 577 + }, + { + "epoch": 0.1806814629571741, + "grad_norm": 0.203125, + "learning_rate": 0.00019902771854477994, + "loss": 1.7296, + "step": 578 + }, + { + "epoch": 0.18099406064395124, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001990242975878852, + "loss": 1.6257, + "step": 579 + }, + { + "epoch": 0.18130665833072834, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001990208706527674, + "loss": 1.6635, + "step": 580 + }, + { + "epoch": 0.18161925601750548, + "grad_norm": 0.208984375, + "learning_rate": 0.00019901743773963353, + "loss": 1.8428, + "step": 581 + }, + { + "epoch": 0.18193185370428258, + "grad_norm": 0.1953125, + "learning_rate": 0.00019901399884869072, + "loss": 1.7945, + "step": 582 + }, + { + "epoch": 0.18224445139105971, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019901055398014662, + "loss": 1.7858, + "step": 583 + }, + { + "epoch": 0.18255704907783682, + "grad_norm": 0.19921875, + "learning_rate": 0.0001990071031342092, + "loss": 1.62, + "step": 584 + }, + { + "epoch": 0.18286964676461395, + "grad_norm": 0.201171875, + "learning_rate": 0.00019900364631108682, + "loss": 1.8136, + "step": 585 + }, + { + "epoch": 0.18318224445139106, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019900018351098813, + "loss": 1.9074, + "step": 586 + }, + { + "epoch": 0.1834948421381682, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001989967147341222, + "loss": 1.8761, + "step": 587 + }, + { + "epoch": 0.1838074398249453, + "grad_norm": 0.19921875, + "learning_rate": 0.00019899323998069846, + "loss": 1.8516, + "step": 588 + }, + { + "epoch": 0.1841200375117224, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001989897592509267, + "loss": 1.7505, + "step": 589 + }, + { + "epoch": 0.18443263519849953, + "grad_norm": 0.189453125, + "learning_rate": 0.00019898627254501697, + "loss": 1.9066, + "step": 590 + }, + { + "epoch": 0.18474523288527664, + "grad_norm": 0.1982421875, + "learning_rate": 
0.0001989827798631799, + "loss": 1.926, + "step": 591 + }, + { + "epoch": 0.18505783057205377, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019897928120562623, + "loss": 1.9225, + "step": 592 + }, + { + "epoch": 0.18537042825883088, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019897577657256724, + "loss": 2.0965, + "step": 593 + }, + { + "epoch": 0.185683025945608, + "grad_norm": 0.20703125, + "learning_rate": 0.00019897226596421447, + "loss": 1.7195, + "step": 594 + }, + { + "epoch": 0.18599562363238512, + "grad_norm": 0.197265625, + "learning_rate": 0.00019896874938077992, + "loss": 1.8197, + "step": 595 + }, + { + "epoch": 0.18630822131916225, + "grad_norm": 0.1884765625, + "learning_rate": 0.0001989652268224758, + "loss": 2.2171, + "step": 596 + }, + { + "epoch": 0.18662081900593935, + "grad_norm": 0.1875, + "learning_rate": 0.00019896169828951488, + "loss": 1.8195, + "step": 597 + }, + { + "epoch": 0.1869334166927165, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019895816378211008, + "loss": 1.6969, + "step": 598 + }, + { + "epoch": 0.1872460143794936, + "grad_norm": 0.19921875, + "learning_rate": 0.00019895462330047484, + "loss": 1.8099, + "step": 599 + }, + { + "epoch": 0.1875586120662707, + "grad_norm": 0.189453125, + "learning_rate": 0.00019895107684482293, + "loss": 1.7597, + "step": 600 + }, + { + "epoch": 0.18787120975304783, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019894752441536838, + "loss": 1.7928, + "step": 601 + }, + { + "epoch": 0.18818380743982493, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019894396601232567, + "loss": 1.7385, + "step": 602 + }, + { + "epoch": 0.18849640512660207, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001989404016359097, + "loss": 1.7216, + "step": 603 + }, + { + "epoch": 0.18880900281337917, + "grad_norm": 0.19140625, + "learning_rate": 0.00019893683128633557, + "loss": 1.749, + "step": 604 + }, + { + "epoch": 0.1891216005001563, + "grad_norm": 0.189453125, + "learning_rate": 0.00019893325496381884, + "loss": 1.8708, + "step": 605 + }, + { + "epoch": 0.1894341981869334, + "grad_norm": 0.197265625, + "learning_rate": 0.00019892967266857547, + "loss": 1.9852, + "step": 606 + }, + { + "epoch": 0.18974679587371054, + "grad_norm": 0.203125, + "learning_rate": 0.0001989260844008217, + "loss": 1.7595, + "step": 607 + }, + { + "epoch": 0.19005939356048765, + "grad_norm": 0.197265625, + "learning_rate": 0.00019892249016077412, + "loss": 1.7231, + "step": 608 + }, + { + "epoch": 0.19037199124726478, + "grad_norm": 0.212890625, + "learning_rate": 0.0001989188899486498, + "loss": 1.7735, + "step": 609 + }, + { + "epoch": 0.1906845889340419, + "grad_norm": 0.1953125, + "learning_rate": 0.00019891528376466598, + "loss": 1.8502, + "step": 610 + }, + { + "epoch": 0.190997186620819, + "grad_norm": 0.19921875, + "learning_rate": 0.00019891167160904046, + "loss": 1.8522, + "step": 611 + }, + { + "epoch": 0.19130978430759613, + "grad_norm": 0.19921875, + "learning_rate": 0.0001989080534819913, + "loss": 2.0308, + "step": 612 + }, + { + "epoch": 0.19162238199437323, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019890442938373686, + "loss": 1.7471, + "step": 613 + }, + { + "epoch": 0.19193497968115036, + "grad_norm": 0.1962890625, + "learning_rate": 0.000198900799314496, + "loss": 1.5426, + "step": 614 + }, + { + "epoch": 0.19224757736792747, + "grad_norm": 0.197265625, + "learning_rate": 0.0001988971632744879, + "loss": 2.0733, + "step": 615 + }, + { + "epoch": 0.1925601750547046, + "grad_norm": 0.193359375, + 
"learning_rate": 0.00019889352126393198, + "loss": 1.8229, + "step": 616 + }, + { + "epoch": 0.1928727727414817, + "grad_norm": 0.19921875, + "learning_rate": 0.00019888987328304817, + "loss": 1.9119, + "step": 617 + }, + { + "epoch": 0.19318537042825884, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001988862193320567, + "loss": 1.6569, + "step": 618 + }, + { + "epoch": 0.19349796811503595, + "grad_norm": 0.189453125, + "learning_rate": 0.00019888255941117816, + "loss": 2.0652, + "step": 619 + }, + { + "epoch": 0.19381056580181308, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001988788935206335, + "loss": 1.6115, + "step": 620 + }, + { + "epoch": 0.19412316348859018, + "grad_norm": 0.197265625, + "learning_rate": 0.00019887522166064402, + "loss": 1.6017, + "step": 621 + }, + { + "epoch": 0.1944357611753673, + "grad_norm": 0.1875, + "learning_rate": 0.00019887154383143143, + "loss": 1.9108, + "step": 622 + }, + { + "epoch": 0.19474835886214442, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019886786003321772, + "loss": 1.6372, + "step": 623 + }, + { + "epoch": 0.19506095654892153, + "grad_norm": 0.296875, + "learning_rate": 0.0001988641702662253, + "loss": 2.4569, + "step": 624 + }, + { + "epoch": 0.19537355423569866, + "grad_norm": 0.1875, + "learning_rate": 0.000198860474530677, + "loss": 1.6954, + "step": 625 + }, + { + "epoch": 0.19568615192247577, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019885677282679585, + "loss": 1.8825, + "step": 626 + }, + { + "epoch": 0.1959987496092529, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019885306515480533, + "loss": 1.7887, + "step": 627 + }, + { + "epoch": 0.19631134729603, + "grad_norm": 0.2109375, + "learning_rate": 0.00019884935151492933, + "loss": 1.8936, + "step": 628 + }, + { + "epoch": 0.19662394498280714, + "grad_norm": 0.19921875, + "learning_rate": 0.00019884563190739196, + "loss": 1.7583, + "step": 629 + }, + { + "epoch": 0.19693654266958424, + "grad_norm": 0.201171875, + "learning_rate": 0.0001988419063324179, + "loss": 1.898, + "step": 630 + }, + { + "epoch": 0.19724914035636137, + "grad_norm": 0.193359375, + "learning_rate": 0.0001988381747902319, + "loss": 2.0045, + "step": 631 + }, + { + "epoch": 0.19756173804313848, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019883443728105943, + "loss": 1.9453, + "step": 632 + }, + { + "epoch": 0.1978743357299156, + "grad_norm": 0.2001953125, + "learning_rate": 0.000198830693805126, + "loss": 1.8481, + "step": 633 + }, + { + "epoch": 0.19818693341669272, + "grad_norm": 0.203125, + "learning_rate": 0.00019882694436265764, + "loss": 1.8409, + "step": 634 + }, + { + "epoch": 0.19849953110346982, + "grad_norm": 0.208984375, + "learning_rate": 0.00019882318895388072, + "loss": 1.8232, + "step": 635 + }, + { + "epoch": 0.19881212879024696, + "grad_norm": 0.193359375, + "learning_rate": 0.00019881942757902197, + "loss": 1.7768, + "step": 636 + }, + { + "epoch": 0.19912472647702406, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001988156602383084, + "loss": 1.7056, + "step": 637 + }, + { + "epoch": 0.1994373241638012, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019881188693196756, + "loss": 1.5243, + "step": 638 + }, + { + "epoch": 0.1997499218505783, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019880810766022714, + "loss": 2.0564, + "step": 639 + }, + { + "epoch": 0.20006251953735543, + "grad_norm": 0.203125, + "learning_rate": 0.00019880432242331536, + "loss": 1.8789, + "step": 640 + }, + { + "epoch": 0.20037511722413254, + "grad_norm": 
0.1923828125, + "learning_rate": 0.00019880053122146073, + "loss": 1.8037, + "step": 641 + }, + { + "epoch": 0.20068771491090967, + "grad_norm": 0.205078125, + "learning_rate": 0.00019879673405489215, + "loss": 1.7692, + "step": 642 + }, + { + "epoch": 0.20100031259768678, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019879293092383882, + "loss": 1.7066, + "step": 643 + }, + { + "epoch": 0.2013129102844639, + "grad_norm": 0.197265625, + "learning_rate": 0.00019878912182853036, + "loss": 1.8715, + "step": 644 + }, + { + "epoch": 0.201625507971241, + "grad_norm": 0.19140625, + "learning_rate": 0.0001987853067691967, + "loss": 1.6647, + "step": 645 + }, + { + "epoch": 0.20193810565801812, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019878148574606824, + "loss": 1.6027, + "step": 646 + }, + { + "epoch": 0.20225070334479525, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019877765875937558, + "loss": 1.6788, + "step": 647 + }, + { + "epoch": 0.20256330103157236, + "grad_norm": 0.205078125, + "learning_rate": 0.00019877382580934977, + "loss": 1.7934, + "step": 648 + }, + { + "epoch": 0.2028758987183495, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019876998689622225, + "loss": 1.6556, + "step": 649 + }, + { + "epoch": 0.2031884964051266, + "grad_norm": 0.1953125, + "learning_rate": 0.00019876614202022475, + "loss": 1.7103, + "step": 650 + }, + { + "epoch": 0.20350109409190373, + "grad_norm": 0.193359375, + "learning_rate": 0.0001987622911815894, + "loss": 1.7654, + "step": 651 + }, + { + "epoch": 0.20381369177868083, + "grad_norm": 0.1875, + "learning_rate": 0.00019875843438054864, + "loss": 1.7043, + "step": 652 + }, + { + "epoch": 0.20412628946545797, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001987545716173354, + "loss": 1.966, + "step": 653 + }, + { + "epoch": 0.20443888715223507, + "grad_norm": 0.19921875, + "learning_rate": 0.0001987507028921828, + "loss": 1.7629, + "step": 654 + }, + { + "epoch": 0.2047514848390122, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019874682820532444, + "loss": 1.766, + "step": 655 + }, + { + "epoch": 0.2050640825257893, + "grad_norm": 0.201171875, + "learning_rate": 0.00019874294755699423, + "loss": 1.6821, + "step": 656 + }, + { + "epoch": 0.20537668021256641, + "grad_norm": 0.21875, + "learning_rate": 0.00019873906094742644, + "loss": 1.806, + "step": 657 + }, + { + "epoch": 0.20568927789934355, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001987351683768557, + "loss": 1.8864, + "step": 658 + }, + { + "epoch": 0.20600187558612065, + "grad_norm": 0.19921875, + "learning_rate": 0.00019873126984551703, + "loss": 1.7406, + "step": 659 + }, + { + "epoch": 0.20631447327289779, + "grad_norm": 0.201171875, + "learning_rate": 0.0001987273653536458, + "loss": 1.7246, + "step": 660 + }, + { + "epoch": 0.2066270709596749, + "grad_norm": 0.1875, + "learning_rate": 0.00019872345490147772, + "loss": 1.9874, + "step": 661 + }, + { + "epoch": 0.20693966864645202, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019871953848924886, + "loss": 1.7792, + "step": 662 + }, + { + "epoch": 0.20725226633322913, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019871561611719564, + "loss": 1.8759, + "step": 663 + }, + { + "epoch": 0.20756486402000626, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019871168778555492, + "loss": 1.9906, + "step": 664 + }, + { + "epoch": 0.20787746170678337, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001987077534945638, + "loss": 1.8973, + "step": 665 + }, + { + "epoch": 
0.2081900593935605, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019870381324445978, + "loss": 1.6312, + "step": 666 + }, + { + "epoch": 0.2085026570803376, + "grad_norm": 0.208984375, + "learning_rate": 0.0001986998670354808, + "loss": 1.8406, + "step": 667 + }, + { + "epoch": 0.2088152547671147, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001986959148678651, + "loss": 1.7828, + "step": 668 + }, + { + "epoch": 0.20912785245389184, + "grad_norm": 0.201171875, + "learning_rate": 0.00019869195674185122, + "loss": 1.9185, + "step": 669 + }, + { + "epoch": 0.20944045014066895, + "grad_norm": 0.201171875, + "learning_rate": 0.00019868799265767816, + "loss": 1.7588, + "step": 670 + }, + { + "epoch": 0.20975304782744608, + "grad_norm": 0.203125, + "learning_rate": 0.00019868402261558524, + "loss": 1.7387, + "step": 671 + }, + { + "epoch": 0.2100656455142232, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019868004661581208, + "loss": 1.6164, + "step": 672 + }, + { + "epoch": 0.21037824320100032, + "grad_norm": 0.19140625, + "learning_rate": 0.0001986760646585988, + "loss": 1.8667, + "step": 673 + }, + { + "epoch": 0.21069084088777743, + "grad_norm": 0.189453125, + "learning_rate": 0.00019867207674418568, + "loss": 1.9312, + "step": 674 + }, + { + "epoch": 0.21100343857455456, + "grad_norm": 0.19921875, + "learning_rate": 0.0001986680828728136, + "loss": 1.7665, + "step": 675 + }, + { + "epoch": 0.21131603626133166, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019866408304472364, + "loss": 1.6056, + "step": 676 + }, + { + "epoch": 0.2116286339481088, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019866007726015723, + "loss": 1.5752, + "step": 677 + }, + { + "epoch": 0.2119412316348859, + "grad_norm": 0.208984375, + "learning_rate": 0.00019865606551935626, + "loss": 1.8815, + "step": 678 + }, + { + "epoch": 0.212253829321663, + "grad_norm": 0.203125, + "learning_rate": 0.00019865204782256287, + "loss": 1.7828, + "step": 679 + }, + { + "epoch": 0.21256642700844014, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001986480241700196, + "loss": 1.9457, + "step": 680 + }, + { + "epoch": 0.21287902469521724, + "grad_norm": 0.1953125, + "learning_rate": 0.00019864399456196946, + "loss": 1.9523, + "step": 681 + }, + { + "epoch": 0.21319162238199438, + "grad_norm": 0.19140625, + "learning_rate": 0.00019863995899865565, + "loss": 1.5974, + "step": 682 + }, + { + "epoch": 0.21350422006877148, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019863591748032184, + "loss": 1.8886, + "step": 683 + }, + { + "epoch": 0.21381681775554862, + "grad_norm": 0.201171875, + "learning_rate": 0.00019863187000721197, + "loss": 1.8564, + "step": 684 + }, + { + "epoch": 0.21412941544232572, + "grad_norm": 0.203125, + "learning_rate": 0.00019862781657957045, + "loss": 1.8022, + "step": 685 + }, + { + "epoch": 0.21444201312910285, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019862375719764192, + "loss": 1.855, + "step": 686 + }, + { + "epoch": 0.21475461081587996, + "grad_norm": 0.208984375, + "learning_rate": 0.0001986196918616715, + "loss": 2.0019, + "step": 687 + }, + { + "epoch": 0.2150672085026571, + "grad_norm": 0.1953125, + "learning_rate": 0.00019861562057190462, + "loss": 1.8597, + "step": 688 + }, + { + "epoch": 0.2153798061894342, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019861154332858708, + "loss": 1.9685, + "step": 689 + }, + { + "epoch": 0.2156924038762113, + "grad_norm": 0.197265625, + "learning_rate": 0.00019860746013196495, + "loss": 1.8702, + "step": 690 
+ }, + { + "epoch": 0.21600500156298844, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019860337098228485, + "loss": 1.6556, + "step": 691 + }, + { + "epoch": 0.21631759924976554, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019859927587979358, + "loss": 2.0366, + "step": 692 + }, + { + "epoch": 0.21663019693654267, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019859517482473838, + "loss": 1.9303, + "step": 693 + }, + { + "epoch": 0.21694279462331978, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019859106781736682, + "loss": 1.6981, + "step": 694 + }, + { + "epoch": 0.2172553923100969, + "grad_norm": 0.205078125, + "learning_rate": 0.00019858695485792686, + "loss": 1.4825, + "step": 695 + }, + { + "epoch": 0.21756798999687402, + "grad_norm": 0.19921875, + "learning_rate": 0.0001985828359466668, + "loss": 1.779, + "step": 696 + }, + { + "epoch": 0.21788058768365115, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019857871108383532, + "loss": 1.7535, + "step": 697 + }, + { + "epoch": 0.21819318537042826, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019857458026968143, + "loss": 1.7039, + "step": 698 + }, + { + "epoch": 0.2185057830572054, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001985704435044545, + "loss": 1.7501, + "step": 699 + }, + { + "epoch": 0.2188183807439825, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001985663007884043, + "loss": 1.8791, + "step": 700 + }, + { + "epoch": 0.2191309784307596, + "grad_norm": 0.20703125, + "learning_rate": 0.00019856215212178094, + "loss": 1.904, + "step": 701 + }, + { + "epoch": 0.21944357611753673, + "grad_norm": 0.2109375, + "learning_rate": 0.00019855799750483484, + "loss": 1.4772, + "step": 702 + }, + { + "epoch": 0.21975617380431384, + "grad_norm": 0.3984375, + "learning_rate": 0.00019855383693781682, + "loss": 2.4316, + "step": 703 + }, + { + "epoch": 0.22006877149109097, + "grad_norm": 0.2109375, + "learning_rate": 0.0001985496704209781, + "loss": 1.6331, + "step": 704 + }, + { + "epoch": 0.22038136917786808, + "grad_norm": 0.22265625, + "learning_rate": 0.0001985454979545702, + "loss": 1.7665, + "step": 705 + }, + { + "epoch": 0.2206939668646452, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019854131953884495, + "loss": 1.9052, + "step": 706 + }, + { + "epoch": 0.2210065645514223, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019853713517405472, + "loss": 1.8316, + "step": 707 + }, + { + "epoch": 0.22131916223819945, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019853294486045208, + "loss": 1.6123, + "step": 708 + }, + { + "epoch": 0.22163175992497655, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019852874859828997, + "loss": 1.8111, + "step": 709 + }, + { + "epoch": 0.22194435761175368, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019852454638782176, + "loss": 1.8234, + "step": 710 + }, + { + "epoch": 0.2222569552985308, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019852033822930114, + "loss": 1.6664, + "step": 711 + }, + { + "epoch": 0.22256955298530792, + "grad_norm": 0.201171875, + "learning_rate": 0.00019851612412298214, + "loss": 1.9896, + "step": 712 + }, + { + "epoch": 0.22288215067208503, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001985119040691192, + "loss": 1.6152, + "step": 713 + }, + { + "epoch": 0.22319474835886213, + "grad_norm": 0.19140625, + "learning_rate": 0.00019850767806796707, + "loss": 2.165, + "step": 714 + }, + { + "epoch": 0.22350734604563927, + "grad_norm": 0.1982421875, + "learning_rate": 
0.00019850344611978087, + "loss": 2.1852, + "step": 715 + }, + { + "epoch": 0.22381994373241637, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019849920822481614, + "loss": 1.7914, + "step": 716 + }, + { + "epoch": 0.2241325414191935, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019849496438332866, + "loss": 2.0296, + "step": 717 + }, + { + "epoch": 0.2244451391059706, + "grad_norm": 0.201171875, + "learning_rate": 0.0001984907145955747, + "loss": 1.7981, + "step": 718 + }, + { + "epoch": 0.22475773679274774, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019848645886181074, + "loss": 1.7928, + "step": 719 + }, + { + "epoch": 0.22507033447952485, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019848219718229378, + "loss": 1.8671, + "step": 720 + }, + { + "epoch": 0.22538293216630198, + "grad_norm": 0.203125, + "learning_rate": 0.00019847792955728107, + "loss": 1.8564, + "step": 721 + }, + { + "epoch": 0.22569552985307909, + "grad_norm": 0.20703125, + "learning_rate": 0.0001984736559870303, + "loss": 1.6293, + "step": 722 + }, + { + "epoch": 0.22600812753985622, + "grad_norm": 0.201171875, + "learning_rate": 0.0001984693764717994, + "loss": 1.9545, + "step": 723 + }, + { + "epoch": 0.22632072522663332, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019846509101184679, + "loss": 1.8173, + "step": 724 + }, + { + "epoch": 0.22663332291341043, + "grad_norm": 0.197265625, + "learning_rate": 0.00019846079960743112, + "loss": 1.649, + "step": 725 + }, + { + "epoch": 0.22694592060018756, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019845650225881154, + "loss": 1.8916, + "step": 726 + }, + { + "epoch": 0.22725851828696467, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019845219896624743, + "loss": 1.7553, + "step": 727 + }, + { + "epoch": 0.2275711159737418, + "grad_norm": 0.19140625, + "learning_rate": 0.0001984478897299986, + "loss": 1.6849, + "step": 728 + }, + { + "epoch": 0.2278837136605189, + "grad_norm": 0.19921875, + "learning_rate": 0.00019844357455032526, + "loss": 1.8667, + "step": 729 + }, + { + "epoch": 0.22819631134729604, + "grad_norm": 0.203125, + "learning_rate": 0.00019843925342748783, + "loss": 1.651, + "step": 730 + }, + { + "epoch": 0.22850890903407314, + "grad_norm": 0.19921875, + "learning_rate": 0.00019843492636174728, + "loss": 1.6074, + "step": 731 + }, + { + "epoch": 0.22882150672085028, + "grad_norm": 0.189453125, + "learning_rate": 0.00019843059335336474, + "loss": 1.8431, + "step": 732 + }, + { + "epoch": 0.22913410440762738, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019842625440260188, + "loss": 1.7872, + "step": 733 + }, + { + "epoch": 0.22944670209440451, + "grad_norm": 0.197265625, + "learning_rate": 0.0001984219095097206, + "loss": 1.6808, + "step": 734 + }, + { + "epoch": 0.22975929978118162, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019841755867498322, + "loss": 1.7461, + "step": 735 + }, + { + "epoch": 0.23007189746795872, + "grad_norm": 0.2109375, + "learning_rate": 0.00019841320189865243, + "loss": 1.7838, + "step": 736 + }, + { + "epoch": 0.23038449515473586, + "grad_norm": 0.20703125, + "learning_rate": 0.0001984088391809912, + "loss": 1.9104, + "step": 737 + }, + { + "epoch": 0.23069709284151296, + "grad_norm": 0.197265625, + "learning_rate": 0.00019840447052226298, + "loss": 1.9961, + "step": 738 + }, + { + "epoch": 0.2310096905282901, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019840009592273143, + "loss": 1.987, + "step": 739 + }, + { + "epoch": 0.2313222882150672, + "grad_norm": 
0.1982421875, + "learning_rate": 0.00019839571538266072, + "loss": 1.6382, + "step": 740 + }, + { + "epoch": 0.23163488590184433, + "grad_norm": 0.19921875, + "learning_rate": 0.0001983913289023153, + "loss": 1.6738, + "step": 741 + }, + { + "epoch": 0.23194748358862144, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019838693648195995, + "loss": 1.8182, + "step": 742 + }, + { + "epoch": 0.23226008127539857, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019838253812185988, + "loss": 1.598, + "step": 743 + }, + { + "epoch": 0.23257267896217568, + "grad_norm": 0.19140625, + "learning_rate": 0.00019837813382228063, + "loss": 1.7465, + "step": 744 + }, + { + "epoch": 0.2328852766489528, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019837372358348806, + "loss": 1.8831, + "step": 745 + }, + { + "epoch": 0.23319787433572992, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019836930740574845, + "loss": 1.525, + "step": 746 + }, + { + "epoch": 0.23351047202250702, + "grad_norm": 0.84375, + "learning_rate": 0.00019836488528932836, + "loss": 3.4084, + "step": 747 + }, + { + "epoch": 0.23382306970928415, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019836045723449483, + "loss": 1.7993, + "step": 748 + }, + { + "epoch": 0.23413566739606126, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019835602324151514, + "loss": 1.8971, + "step": 749 + }, + { + "epoch": 0.2344482650828384, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019835158331065703, + "loss": 1.7286, + "step": 750 + }, + { + "epoch": 0.2347608627696155, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019834713744218844, + "loss": 1.6018, + "step": 751 + }, + { + "epoch": 0.23507346045639263, + "grad_norm": 0.2109375, + "learning_rate": 0.00019834268563637787, + "loss": 1.8705, + "step": 752 + }, + { + "epoch": 0.23538605814316974, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019833822789349409, + "loss": 1.8121, + "step": 753 + }, + { + "epoch": 0.23569865582994687, + "grad_norm": 0.1953125, + "learning_rate": 0.00019833376421380612, + "loss": 1.6886, + "step": 754 + }, + { + "epoch": 0.23601125351672397, + "grad_norm": 0.205078125, + "learning_rate": 0.00019832929459758352, + "loss": 1.6922, + "step": 755 + }, + { + "epoch": 0.2363238512035011, + "grad_norm": 0.208984375, + "learning_rate": 0.0001983248190450961, + "loss": 1.7953, + "step": 756 + }, + { + "epoch": 0.2366364488902782, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019832033755661405, + "loss": 1.7892, + "step": 757 + }, + { + "epoch": 0.23694904657705532, + "grad_norm": 0.208984375, + "learning_rate": 0.00019831585013240793, + "loss": 1.5738, + "step": 758 + }, + { + "epoch": 0.23726164426383245, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001983113567727487, + "loss": 1.6719, + "step": 759 + }, + { + "epoch": 0.23757424195060955, + "grad_norm": 0.203125, + "learning_rate": 0.00019830685747790748, + "loss": 1.9564, + "step": 760 + }, + { + "epoch": 0.2378868396373867, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001983023522481561, + "loss": 2.1432, + "step": 761 + }, + { + "epoch": 0.2381994373241638, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001982978410837664, + "loss": 1.8179, + "step": 762 + }, + { + "epoch": 0.23851203501094093, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001982933239850108, + "loss": 1.9499, + "step": 763 + }, + { + "epoch": 0.23882463269771803, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019828880095216193, + "loss": 1.5989, + "step": 764 + }, + { + "epoch": 
0.23913723038449516, + "grad_norm": 0.2109375, + "learning_rate": 0.00019828427198549293, + "loss": 1.8161, + "step": 765 + }, + { + "epoch": 0.23944982807127227, + "grad_norm": 0.208984375, + "learning_rate": 0.0001982797370852772, + "loss": 1.9312, + "step": 766 + }, + { + "epoch": 0.2397624257580494, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019827519625178846, + "loss": 1.7275, + "step": 767 + }, + { + "epoch": 0.2400750234448265, + "grad_norm": 0.203125, + "learning_rate": 0.0001982706494853009, + "loss": 2.0002, + "step": 768 + }, + { + "epoch": 0.2403876211316036, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019826609678608902, + "loss": 1.8021, + "step": 769 + }, + { + "epoch": 0.24070021881838075, + "grad_norm": 0.205078125, + "learning_rate": 0.00019826153815442763, + "loss": 1.5546, + "step": 770 + }, + { + "epoch": 0.24101281650515785, + "grad_norm": 0.212890625, + "learning_rate": 0.000198256973590592, + "loss": 1.5848, + "step": 771 + }, + { + "epoch": 0.24132541419193498, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019825240309485765, + "loss": 1.7763, + "step": 772 + }, + { + "epoch": 0.2416380118787121, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001982478266675005, + "loss": 1.6906, + "step": 773 + }, + { + "epoch": 0.24195060956548922, + "grad_norm": 0.21484375, + "learning_rate": 0.00019824324430879687, + "loss": 1.9644, + "step": 774 + }, + { + "epoch": 0.24226320725226633, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019823865601902341, + "loss": 1.9122, + "step": 775 + }, + { + "epoch": 0.24257580493904346, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019823406179845707, + "loss": 1.9017, + "step": 776 + }, + { + "epoch": 0.24288840262582057, + "grad_norm": 0.21875, + "learning_rate": 0.00019822946164737526, + "loss": 1.8361, + "step": 777 + }, + { + "epoch": 0.2432010003125977, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019822485556605566, + "loss": 1.7349, + "step": 778 + }, + { + "epoch": 0.2435135979993748, + "grad_norm": 0.212890625, + "learning_rate": 0.00019822024355477637, + "loss": 1.6017, + "step": 779 + }, + { + "epoch": 0.2438261956861519, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001982156256138158, + "loss": 1.8296, + "step": 780 + }, + { + "epoch": 0.24413879337292904, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019821100174345277, + "loss": 1.6754, + "step": 781 + }, + { + "epoch": 0.24445139105970615, + "grad_norm": 0.216796875, + "learning_rate": 0.0001982063719439664, + "loss": 2.0037, + "step": 782 + }, + { + "epoch": 0.24476398874648328, + "grad_norm": 0.2109375, + "learning_rate": 0.00019820173621563623, + "loss": 1.887, + "step": 783 + }, + { + "epoch": 0.24507658643326038, + "grad_norm": 0.19140625, + "learning_rate": 0.0001981970945587421, + "loss": 1.5708, + "step": 784 + }, + { + "epoch": 0.24538918412003752, + "grad_norm": 0.68359375, + "learning_rate": 0.0001981924469735642, + "loss": 2.3282, + "step": 785 + }, + { + "epoch": 0.24570178180681462, + "grad_norm": 0.203125, + "learning_rate": 0.00019818779346038318, + "loss": 1.7515, + "step": 786 + }, + { + "epoch": 0.24601437949359176, + "grad_norm": 0.20703125, + "learning_rate": 0.00019818313401947997, + "loss": 1.7623, + "step": 787 + }, + { + "epoch": 0.24632697718036886, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019817846865113577, + "loss": 1.8036, + "step": 788 + }, + { + "epoch": 0.246639574867146, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001981737973556324, + "loss": 1.8455, + "step": 789 + 
}, + { + "epoch": 0.2469521725539231, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001981691201332517, + "loss": 1.7791, + "step": 790 + }, + { + "epoch": 0.24726477024070023, + "grad_norm": 0.205078125, + "learning_rate": 0.00019816443698427615, + "loss": 2.0416, + "step": 791 + }, + { + "epoch": 0.24757736792747734, + "grad_norm": 0.2265625, + "learning_rate": 0.00019815974790898846, + "loss": 2.2271, + "step": 792 + }, + { + "epoch": 0.24788996561425444, + "grad_norm": 0.203125, + "learning_rate": 0.00019815505290767172, + "loss": 1.5433, + "step": 793 + }, + { + "epoch": 0.24820256330103158, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001981503519806093, + "loss": 1.7228, + "step": 794 + }, + { + "epoch": 0.24851516098780868, + "grad_norm": 0.20703125, + "learning_rate": 0.00019814564512808512, + "loss": 1.8217, + "step": 795 + }, + { + "epoch": 0.2488277586745858, + "grad_norm": 0.203125, + "learning_rate": 0.00019814093235038323, + "loss": 1.8205, + "step": 796 + }, + { + "epoch": 0.24914035636136292, + "grad_norm": 0.2109375, + "learning_rate": 0.00019813621364778817, + "loss": 1.8541, + "step": 797 + }, + { + "epoch": 0.24945295404814005, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001981314890205849, + "loss": 1.8656, + "step": 798 + }, + { + "epoch": 0.24976555173491716, + "grad_norm": 0.2109375, + "learning_rate": 0.00019812675846905855, + "loss": 1.809, + "step": 799 + }, + { + "epoch": 0.25007814942169426, + "grad_norm": 0.20703125, + "learning_rate": 0.00019812202199349476, + "loss": 2.0585, + "step": 800 + }, + { + "epoch": 0.2503907471084714, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019811727959417945, + "loss": 1.9492, + "step": 801 + }, + { + "epoch": 0.25070334479524853, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019811253127139896, + "loss": 1.8192, + "step": 802 + }, + { + "epoch": 0.25101594248202563, + "grad_norm": 0.2109375, + "learning_rate": 0.0001981077770254399, + "loss": 1.4981, + "step": 803 + }, + { + "epoch": 0.25132854016880274, + "grad_norm": 0.205078125, + "learning_rate": 0.00019810301685658935, + "loss": 1.8598, + "step": 804 + }, + { + "epoch": 0.25164113785557984, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019809825076513464, + "loss": 1.7946, + "step": 805 + }, + { + "epoch": 0.251953735542357, + "grad_norm": 0.23046875, + "learning_rate": 0.00019809347875136352, + "loss": 1.784, + "step": 806 + }, + { + "epoch": 0.2522663332291341, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019808870081556413, + "loss": 1.9401, + "step": 807 + }, + { + "epoch": 0.2525789309159112, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019808391695802483, + "loss": 2.0217, + "step": 808 + }, + { + "epoch": 0.2528915286026883, + "grad_norm": 0.20703125, + "learning_rate": 0.0001980791271790345, + "loss": 1.7354, + "step": 809 + }, + { + "epoch": 0.2532041262894655, + "grad_norm": 0.20703125, + "learning_rate": 0.00019807433147888225, + "loss": 2.1094, + "step": 810 + }, + { + "epoch": 0.2535167239762426, + "grad_norm": 0.8125, + "learning_rate": 0.00019806952985785764, + "loss": 2.8019, + "step": 811 + }, + { + "epoch": 0.2538293216630197, + "grad_norm": 0.193359375, + "learning_rate": 0.00019806472231625056, + "loss": 1.554, + "step": 812 + }, + { + "epoch": 0.2541419193497968, + "grad_norm": 0.1953125, + "learning_rate": 0.0001980599088543512, + "loss": 1.7158, + "step": 813 + }, + { + "epoch": 0.2544545170365739, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019805508947245021, + "loss": 1.934, + "step": 
814 + }, + { + "epoch": 0.25476711472335106, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001980502641708385, + "loss": 2.0267, + "step": 815 + }, + { + "epoch": 0.25507971241012817, + "grad_norm": 0.20703125, + "learning_rate": 0.0001980454329498074, + "loss": 1.6819, + "step": 816 + }, + { + "epoch": 0.2553923100969053, + "grad_norm": 0.203125, + "learning_rate": 0.00019804059580964855, + "loss": 1.7279, + "step": 817 + }, + { + "epoch": 0.2557049077836824, + "grad_norm": 0.2109375, + "learning_rate": 0.00019803575275065404, + "loss": 1.6234, + "step": 818 + }, + { + "epoch": 0.25601750547045954, + "grad_norm": 0.205078125, + "learning_rate": 0.0001980309037731162, + "loss": 1.4631, + "step": 819 + }, + { + "epoch": 0.25633010315723664, + "grad_norm": 0.2109375, + "learning_rate": 0.00019802604887732774, + "loss": 1.7769, + "step": 820 + }, + { + "epoch": 0.25664270084401375, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019802118806358182, + "loss": 1.7928, + "step": 821 + }, + { + "epoch": 0.25695529853079085, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019801632133217189, + "loss": 1.639, + "step": 822 + }, + { + "epoch": 0.257267896217568, + "grad_norm": 0.212890625, + "learning_rate": 0.0001980114486833917, + "loss": 1.6918, + "step": 823 + }, + { + "epoch": 0.2575804939043451, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019800657011753548, + "loss": 1.8273, + "step": 824 + }, + { + "epoch": 0.2578930915911222, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001980016856348977, + "loss": 1.9625, + "step": 825 + }, + { + "epoch": 0.25820568927789933, + "grad_norm": 0.201171875, + "learning_rate": 0.00019799679523577332, + "loss": 1.741, + "step": 826 + }, + { + "epoch": 0.25851828696467644, + "grad_norm": 0.19921875, + "learning_rate": 0.00019799189892045748, + "loss": 2.0397, + "step": 827 + }, + { + "epoch": 0.2588308846514536, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019798699668924585, + "loss": 1.7246, + "step": 828 + }, + { + "epoch": 0.2591434823382307, + "grad_norm": 0.203125, + "learning_rate": 0.00019798208854243437, + "loss": 1.5622, + "step": 829 + }, + { + "epoch": 0.2594560800250078, + "grad_norm": 0.1953125, + "learning_rate": 0.00019797717448031936, + "loss": 1.4121, + "step": 830 + }, + { + "epoch": 0.2597686777117849, + "grad_norm": 0.20703125, + "learning_rate": 0.00019797225450319744, + "loss": 1.6693, + "step": 831 + }, + { + "epoch": 0.2600812753985621, + "grad_norm": 0.208984375, + "learning_rate": 0.0001979673286113657, + "loss": 1.6021, + "step": 832 + }, + { + "epoch": 0.2603938730853392, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001979623968051215, + "loss": 1.9199, + "step": 833 + }, + { + "epoch": 0.2607064707721163, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019795745908476254, + "loss": 2.0403, + "step": 834 + }, + { + "epoch": 0.2610190684588934, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019795251545058694, + "loss": 1.8294, + "step": 835 + }, + { + "epoch": 0.26133166614567055, + "grad_norm": 0.20703125, + "learning_rate": 0.00019794756590289317, + "loss": 1.8625, + "step": 836 + }, + { + "epoch": 0.26164426383244765, + "grad_norm": 0.21484375, + "learning_rate": 0.00019794261044198003, + "loss": 1.8086, + "step": 837 + }, + { + "epoch": 0.26195686151922476, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001979376490681467, + "loss": 1.6601, + "step": 838 + }, + { + "epoch": 0.26226945920600186, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019793268178169268, + "loss": 
1.5396, + "step": 839 + }, + { + "epoch": 0.26258205689277897, + "grad_norm": 0.20703125, + "learning_rate": 0.00019792770858291788, + "loss": 1.7095, + "step": 840 + }, + { + "epoch": 0.26289465457955613, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019792272947212253, + "loss": 1.8782, + "step": 841 + }, + { + "epoch": 0.26320725226633324, + "grad_norm": 0.19921875, + "learning_rate": 0.00019791774444960717, + "loss": 1.9358, + "step": 842 + }, + { + "epoch": 0.26351984995311034, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019791275351567286, + "loss": 1.7342, + "step": 843 + }, + { + "epoch": 0.26383244763988745, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019790775667062081, + "loss": 1.7846, + "step": 844 + }, + { + "epoch": 0.2641450453266646, + "grad_norm": 0.201171875, + "learning_rate": 0.0001979027539147527, + "loss": 1.845, + "step": 845 + }, + { + "epoch": 0.2644576430134417, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001978977452483706, + "loss": 1.9035, + "step": 846 + }, + { + "epoch": 0.2647702407002188, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001978927306717769, + "loss": 1.5762, + "step": 847 + }, + { + "epoch": 0.2650828383869959, + "grad_norm": 0.1953125, + "learning_rate": 0.0001978877101852743, + "loss": 2.1721, + "step": 848 + }, + { + "epoch": 0.26539543607377303, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019788268378916586, + "loss": 1.7108, + "step": 849 + }, + { + "epoch": 0.2657080337605502, + "grad_norm": 1.0234375, + "learning_rate": 0.00019787765148375508, + "loss": 2.5699, + "step": 850 + }, + { + "epoch": 0.2660206314473273, + "grad_norm": 0.203125, + "learning_rate": 0.00019787261326934577, + "loss": 1.6568, + "step": 851 + }, + { + "epoch": 0.2663332291341044, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019786756914624208, + "loss": 1.8594, + "step": 852 + }, + { + "epoch": 0.2666458268208815, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019786251911474849, + "loss": 1.8597, + "step": 853 + }, + { + "epoch": 0.26695842450765866, + "grad_norm": 0.216796875, + "learning_rate": 0.00019785746317516994, + "loss": 2.0457, + "step": 854 + }, + { + "epoch": 0.26727102219443577, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019785240132781163, + "loss": 1.832, + "step": 855 + }, + { + "epoch": 0.2675836198812129, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019784733357297915, + "loss": 1.68, + "step": 856 + }, + { + "epoch": 0.26789621756799, + "grad_norm": 0.2109375, + "learning_rate": 0.00019784225991097848, + "loss": 1.8997, + "step": 857 + }, + { + "epoch": 0.26820881525476714, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019783718034211586, + "loss": 1.7594, + "step": 858 + }, + { + "epoch": 0.26852141294154425, + "grad_norm": 0.2138671875, + "learning_rate": 0.000197832094866698, + "loss": 1.7918, + "step": 859 + }, + { + "epoch": 0.26883401062832135, + "grad_norm": 0.203125, + "learning_rate": 0.00019782700348503193, + "loss": 1.6616, + "step": 860 + }, + { + "epoch": 0.26914660831509846, + "grad_norm": 0.201171875, + "learning_rate": 0.00019782190619742495, + "loss": 1.8357, + "step": 861 + }, + { + "epoch": 0.26945920600187556, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001978168030041849, + "loss": 1.74, + "step": 862 + }, + { + "epoch": 0.2697718036886527, + "grad_norm": 0.203125, + "learning_rate": 0.00019781169390561975, + "loss": 1.4934, + "step": 863 + }, + { + "epoch": 0.27008440137542983, + "grad_norm": 0.216796875, + "learning_rate": 
0.000197806578902038, + "loss": 1.6285, + "step": 864 + }, + { + "epoch": 0.27039699906220693, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019780145799374848, + "loss": 1.5881, + "step": 865 + }, + { + "epoch": 0.27070959674898404, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019779633118106028, + "loss": 1.714, + "step": 866 + }, + { + "epoch": 0.2710221944357612, + "grad_norm": 0.2119140625, + "learning_rate": 0.000197791198464283, + "loss": 1.9303, + "step": 867 + }, + { + "epoch": 0.2713347921225383, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001977860598437264, + "loss": 1.6095, + "step": 868 + }, + { + "epoch": 0.2716473898093154, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019778091531970072, + "loss": 1.7565, + "step": 869 + }, + { + "epoch": 0.2719599874960925, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019777576489251664, + "loss": 1.5668, + "step": 870 + }, + { + "epoch": 0.2722725851828696, + "grad_norm": 0.20703125, + "learning_rate": 0.00019777060856248504, + "loss": 1.6762, + "step": 871 + }, + { + "epoch": 0.2725851828696468, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019776544632991717, + "loss": 1.7808, + "step": 872 + }, + { + "epoch": 0.2728977805564239, + "grad_norm": 0.203125, + "learning_rate": 0.00019776027819512474, + "loss": 1.8983, + "step": 873 + }, + { + "epoch": 0.273210378243201, + "grad_norm": 0.20703125, + "learning_rate": 0.00019775510415841977, + "loss": 1.837, + "step": 874 + }, + { + "epoch": 0.2735229759299781, + "grad_norm": 0.20703125, + "learning_rate": 0.00019774992422011452, + "loss": 1.7363, + "step": 875 + }, + { + "epoch": 0.27383557361675526, + "grad_norm": 0.2109375, + "learning_rate": 0.00019774473838052184, + "loss": 1.8509, + "step": 876 + }, + { + "epoch": 0.27414817130353236, + "grad_norm": 0.23046875, + "learning_rate": 0.00019773954663995476, + "loss": 1.8239, + "step": 877 + }, + { + "epoch": 0.27446076899030947, + "grad_norm": 0.205078125, + "learning_rate": 0.00019773434899872665, + "loss": 2.0633, + "step": 878 + }, + { + "epoch": 0.2747733666770866, + "grad_norm": 0.21484375, + "learning_rate": 0.00019772914545715135, + "loss": 2.0269, + "step": 879 + }, + { + "epoch": 0.27508596436386373, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019772393601554303, + "loss": 1.7389, + "step": 880 + }, + { + "epoch": 0.27539856205064084, + "grad_norm": 0.212890625, + "learning_rate": 0.00019771872067421615, + "loss": 2.0936, + "step": 881 + }, + { + "epoch": 0.27571115973741794, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019771349943348558, + "loss": 1.7132, + "step": 882 + }, + { + "epoch": 0.27602375742419505, + "grad_norm": 0.21484375, + "learning_rate": 0.00019770827229366654, + "loss": 1.6179, + "step": 883 + }, + { + "epoch": 0.27633635511097215, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019770303925507456, + "loss": 1.9907, + "step": 884 + }, + { + "epoch": 0.2766489527977493, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001976978003180256, + "loss": 1.5918, + "step": 885 + }, + { + "epoch": 0.2769615504845264, + "grad_norm": 0.2060546875, + "learning_rate": 0.000197692555482836, + "loss": 1.6069, + "step": 886 + }, + { + "epoch": 0.2772741481713035, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019768730474982227, + "loss": 1.9966, + "step": 887 + }, + { + "epoch": 0.27758674585808063, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019768204811930148, + "loss": 1.7923, + "step": 888 + }, + { + "epoch": 0.2778993435448578, + "grad_norm": 
0.2001953125, + "learning_rate": 0.00019767678559159098, + "loss": 1.6497, + "step": 889 + }, + { + "epoch": 0.2782119412316349, + "grad_norm": 0.2109375, + "learning_rate": 0.00019767151716700845, + "loss": 1.9629, + "step": 890 + }, + { + "epoch": 0.278524538918412, + "grad_norm": 0.20703125, + "learning_rate": 0.00019766624284587195, + "loss": 1.8348, + "step": 891 + }, + { + "epoch": 0.2788371366051891, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019766096262849994, + "loss": 1.8409, + "step": 892 + }, + { + "epoch": 0.27914973429196627, + "grad_norm": 0.208984375, + "learning_rate": 0.00019765567651521115, + "loss": 1.7796, + "step": 893 + }, + { + "epoch": 0.2794623319787434, + "grad_norm": 0.201171875, + "learning_rate": 0.00019765038450632476, + "loss": 1.9009, + "step": 894 + }, + { + "epoch": 0.2797749296655205, + "grad_norm": 0.203125, + "learning_rate": 0.00019764508660216019, + "loss": 1.4491, + "step": 895 + }, + { + "epoch": 0.2800875273522976, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001976397828030373, + "loss": 1.5436, + "step": 896 + }, + { + "epoch": 0.2804001250390747, + "grad_norm": 0.2109375, + "learning_rate": 0.0001976344731092763, + "loss": 1.8577, + "step": 897 + }, + { + "epoch": 0.28071272272585185, + "grad_norm": 0.3828125, + "learning_rate": 0.0001976291575211978, + "loss": 2.6341, + "step": 898 + }, + { + "epoch": 0.28102532041262895, + "grad_norm": 0.203125, + "learning_rate": 0.00019762383603912258, + "loss": 1.6624, + "step": 899 + }, + { + "epoch": 0.28133791809940606, + "grad_norm": 0.205078125, + "learning_rate": 0.000197618508663372, + "loss": 1.6193, + "step": 900 + }, + { + "epoch": 0.28165051578618316, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019761317539426765, + "loss": 1.6416, + "step": 901 + }, + { + "epoch": 0.2819631134729603, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019760783623213153, + "loss": 1.5813, + "step": 902 + }, + { + "epoch": 0.28227571115973743, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019760249117728592, + "loss": 2.1245, + "step": 903 + }, + { + "epoch": 0.28258830884651454, + "grad_norm": 0.236328125, + "learning_rate": 0.00019759714023005357, + "loss": 2.0305, + "step": 904 + }, + { + "epoch": 0.28290090653329164, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001975917833907575, + "loss": 1.4689, + "step": 905 + }, + { + "epoch": 0.28321350422006875, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019758642065972112, + "loss": 1.9306, + "step": 906 + }, + { + "epoch": 0.2835261019068459, + "grad_norm": 0.212890625, + "learning_rate": 0.0001975810520372681, + "loss": 1.8309, + "step": 907 + }, + { + "epoch": 0.283838699593623, + "grad_norm": 0.216796875, + "learning_rate": 0.0001975756775237227, + "loss": 1.732, + "step": 908 + }, + { + "epoch": 0.2841512972804001, + "grad_norm": 0.208984375, + "learning_rate": 0.00019757029711940923, + "loss": 1.5233, + "step": 909 + }, + { + "epoch": 0.2844638949671772, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019756491082465263, + "loss": 1.6491, + "step": 910 + }, + { + "epoch": 0.2847764926539544, + "grad_norm": 0.201171875, + "learning_rate": 0.00019755951863977805, + "loss": 2.2236, + "step": 911 + }, + { + "epoch": 0.2850890903407315, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019755412056511097, + "loss": 1.8299, + "step": 912 + }, + { + "epoch": 0.2854016880275086, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019754871660097734, + "loss": 1.5403, + "step": 913 + }, + { + "epoch": 
0.2857142857142857, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019754330674770339, + "loss": 1.5712, + "step": 914 + }, + { + "epoch": 0.28602688340106286, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019753789100561569, + "loss": 1.8814, + "step": 915 + }, + { + "epoch": 0.28633948108783996, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001975324693750412, + "loss": 1.7153, + "step": 916 + }, + { + "epoch": 0.28665207877461707, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001975270418563073, + "loss": 2.0221, + "step": 917 + }, + { + "epoch": 0.2869646764613942, + "grad_norm": 0.2109375, + "learning_rate": 0.00019752160844974158, + "loss": 1.7176, + "step": 918 + }, + { + "epoch": 0.2872772741481713, + "grad_norm": 0.208984375, + "learning_rate": 0.0001975161691556721, + "loss": 1.8581, + "step": 919 + }, + { + "epoch": 0.28758987183494844, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019751072397442715, + "loss": 1.8127, + "step": 920 + }, + { + "epoch": 0.28790246952172555, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001975052729063356, + "loss": 1.5183, + "step": 921 + }, + { + "epoch": 0.28821506720850265, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019749981595172647, + "loss": 1.7887, + "step": 922 + }, + { + "epoch": 0.28852766489527976, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019749435311092919, + "loss": 1.7053, + "step": 923 + }, + { + "epoch": 0.2888402625820569, + "grad_norm": 0.212890625, + "learning_rate": 0.00019748888438427358, + "loss": 1.7008, + "step": 924 + }, + { + "epoch": 0.289152860268834, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019748340977208977, + "loss": 1.888, + "step": 925 + }, + { + "epoch": 0.2894654579556111, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001974779292747083, + "loss": 1.708, + "step": 926 + }, + { + "epoch": 0.28977805564238823, + "grad_norm": 0.21484375, + "learning_rate": 0.00019747244289246006, + "loss": 1.8244, + "step": 927 + }, + { + "epoch": 0.29009065332916534, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001974669506256762, + "loss": 1.5614, + "step": 928 + }, + { + "epoch": 0.2904032510159425, + "grad_norm": 0.38671875, + "learning_rate": 0.00019746145247468832, + "loss": 2.2925, + "step": 929 + }, + { + "epoch": 0.2907158487027196, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019745594843982836, + "loss": 1.7933, + "step": 930 + }, + { + "epoch": 0.2910284463894967, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001974504385214286, + "loss": 1.8521, + "step": 931 + }, + { + "epoch": 0.2913410440762738, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019744492271982168, + "loss": 1.6939, + "step": 932 + }, + { + "epoch": 0.291653641763051, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019743940103534062, + "loss": 1.6783, + "step": 933 + }, + { + "epoch": 0.2919662394498281, + "grad_norm": 0.203125, + "learning_rate": 0.00019743387346831876, + "loss": 2.0204, + "step": 934 + }, + { + "epoch": 0.2922788371366052, + "grad_norm": 0.197265625, + "learning_rate": 0.00019742834001908977, + "loss": 1.7812, + "step": 935 + }, + { + "epoch": 0.2925914348233823, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019742280068798775, + "loss": 1.7483, + "step": 936 + }, + { + "epoch": 0.29290403251015945, + "grad_norm": 0.21484375, + "learning_rate": 0.00019741725547534712, + "loss": 1.8223, + "step": 937 + }, + { + "epoch": 0.29321663019693656, + "grad_norm": 0.20703125, + "learning_rate": 0.0001974117043815026, + "loss": 1.8306, + 
"step": 938 + }, + { + "epoch": 0.29352922788371366, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019740614740678937, + "loss": 1.9111, + "step": 939 + }, + { + "epoch": 0.29384182557049077, + "grad_norm": 0.224609375, + "learning_rate": 0.0001974005845515429, + "loss": 1.7384, + "step": 940 + }, + { + "epoch": 0.29415442325726787, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019739501581609903, + "loss": 1.7809, + "step": 941 + }, + { + "epoch": 0.29446702094404503, + "grad_norm": 0.212890625, + "learning_rate": 0.00019738944120079393, + "loss": 1.8266, + "step": 942 + }, + { + "epoch": 0.29477961863082214, + "grad_norm": 0.203125, + "learning_rate": 0.0001973838607059642, + "loss": 2.0459, + "step": 943 + }, + { + "epoch": 0.29509221631759924, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019737827433194665, + "loss": 1.7519, + "step": 944 + }, + { + "epoch": 0.29540481400437635, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001973726820790786, + "loss": 1.6264, + "step": 945 + }, + { + "epoch": 0.2957174116911535, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019736708394769764, + "loss": 1.6892, + "step": 946 + }, + { + "epoch": 0.2960300093779306, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019736147993814176, + "loss": 1.9491, + "step": 947 + }, + { + "epoch": 0.2963426070647077, + "grad_norm": 0.1953125, + "learning_rate": 0.00019735587005074927, + "loss": 1.7754, + "step": 948 + }, + { + "epoch": 0.2966552047514848, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019735025428585886, + "loss": 1.9126, + "step": 949 + }, + { + "epoch": 0.29696780243826193, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019734463264380953, + "loss": 2.071, + "step": 950 + }, + { + "epoch": 0.2972804001250391, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001973390051249407, + "loss": 1.6336, + "step": 951 + }, + { + "epoch": 0.2975929978118162, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019733337172959204, + "loss": 1.4598, + "step": 952 + }, + { + "epoch": 0.2979055954985933, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001973277324581037, + "loss": 1.5984, + "step": 953 + }, + { + "epoch": 0.2982181931853704, + "grad_norm": 0.21875, + "learning_rate": 0.00019732208731081615, + "loss": 1.9082, + "step": 954 + }, + { + "epoch": 0.29853079087214757, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019731643628807018, + "loss": 1.6075, + "step": 955 + }, + { + "epoch": 0.29884338855892467, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019731077939020693, + "loss": 1.9933, + "step": 956 + }, + { + "epoch": 0.2991559862457018, + "grad_norm": 0.20703125, + "learning_rate": 0.00019730511661756792, + "loss": 1.5719, + "step": 957 + }, + { + "epoch": 0.2994685839324789, + "grad_norm": 0.20703125, + "learning_rate": 0.00019729944797049502, + "loss": 1.6318, + "step": 958 + }, + { + "epoch": 0.29978118161925604, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019729377344933043, + "loss": 1.8574, + "step": 959 + }, + { + "epoch": 0.30009377930603315, + "grad_norm": 0.208984375, + "learning_rate": 0.0001972880930544168, + "loss": 1.9144, + "step": 960 + }, + { + "epoch": 0.30040637699281025, + "grad_norm": 0.2001953125, + "learning_rate": 0.000197282406786097, + "loss": 1.7335, + "step": 961 + }, + { + "epoch": 0.30071897467958736, + "grad_norm": 0.203125, + "learning_rate": 0.00019727671464471436, + "loss": 1.7289, + "step": 962 + }, + { + "epoch": 0.30103157236636446, + "grad_norm": 0.2119140625, + "learning_rate": 
0.00019727101663061247, + "loss": 1.994, + "step": 963 + }, + { + "epoch": 0.3013441700531416, + "grad_norm": 0.205078125, + "learning_rate": 0.00019726531274413532, + "loss": 1.7233, + "step": 964 + }, + { + "epoch": 0.30165676773991873, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019725960298562733, + "loss": 1.8961, + "step": 965 + }, + { + "epoch": 0.30196936542669583, + "grad_norm": 0.21484375, + "learning_rate": 0.00019725388735543318, + "loss": 1.6978, + "step": 966 + }, + { + "epoch": 0.30228196311347294, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001972481658538979, + "loss": 1.752, + "step": 967 + }, + { + "epoch": 0.3025945608002501, + "grad_norm": 0.205078125, + "learning_rate": 0.00019724243848136692, + "loss": 2.0531, + "step": 968 + }, + { + "epoch": 0.3029071584870272, + "grad_norm": 0.208984375, + "learning_rate": 0.000197236705238186, + "loss": 1.7117, + "step": 969 + }, + { + "epoch": 0.3032197561738043, + "grad_norm": 0.20703125, + "learning_rate": 0.00019723096612470133, + "loss": 1.5911, + "step": 970 + }, + { + "epoch": 0.3035323538605814, + "grad_norm": 0.20703125, + "learning_rate": 0.00019722522114125929, + "loss": 1.8811, + "step": 971 + }, + { + "epoch": 0.3038449515473586, + "grad_norm": 0.22265625, + "learning_rate": 0.00019721947028820676, + "loss": 1.6444, + "step": 972 + }, + { + "epoch": 0.3041575492341357, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001972137135658909, + "loss": 1.5187, + "step": 973 + }, + { + "epoch": 0.3044701469209128, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001972079509746593, + "loss": 1.6957, + "step": 974 + }, + { + "epoch": 0.3047827446076899, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019720218251485983, + "loss": 1.5887, + "step": 975 + }, + { + "epoch": 0.305095342294467, + "grad_norm": 0.216796875, + "learning_rate": 0.0001971964081868407, + "loss": 1.7837, + "step": 976 + }, + { + "epoch": 0.30540793998124416, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001971906279909506, + "loss": 1.8848, + "step": 977 + }, + { + "epoch": 0.30572053766802126, + "grad_norm": 0.224609375, + "learning_rate": 0.0001971848419275384, + "loss": 1.8966, + "step": 978 + }, + { + "epoch": 0.30603313535479837, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019717904999695348, + "loss": 1.6581, + "step": 979 + }, + { + "epoch": 0.3063457330415755, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019717325219954543, + "loss": 1.6071, + "step": 980 + }, + { + "epoch": 0.30665833072835263, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019716744853566436, + "loss": 1.8169, + "step": 981 + }, + { + "epoch": 0.30697092841512974, + "grad_norm": 0.197265625, + "learning_rate": 0.0001971616390056606, + "loss": 1.6017, + "step": 982 + }, + { + "epoch": 0.30728352610190685, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019715582360988482, + "loss": 1.6999, + "step": 983 + }, + { + "epoch": 0.30759612378868395, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019715000234868821, + "loss": 1.7758, + "step": 984 + }, + { + "epoch": 0.30790872147546106, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019714417522242214, + "loss": 1.9776, + "step": 985 + }, + { + "epoch": 0.3082213191622382, + "grad_norm": 0.2265625, + "learning_rate": 0.00019713834223143844, + "loss": 1.7776, + "step": 986 + }, + { + "epoch": 0.3085339168490153, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019713250337608922, + "loss": 1.8847, + "step": 987 + }, + { + "epoch": 0.3088465145357924, + "grad_norm": 
0.2109375, + "learning_rate": 0.000197126658656727, + "loss": 1.8091, + "step": 988 + }, + { + "epoch": 0.30915911222256953, + "grad_norm": 0.212890625, + "learning_rate": 0.00019712080807370464, + "loss": 1.804, + "step": 989 + }, + { + "epoch": 0.3094717099093467, + "grad_norm": 0.22265625, + "learning_rate": 0.00019711495162737529, + "loss": 1.782, + "step": 990 + }, + { + "epoch": 0.3097843075961238, + "grad_norm": 0.201171875, + "learning_rate": 0.0001971090893180926, + "loss": 1.5211, + "step": 991 + }, + { + "epoch": 0.3100969052829009, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001971032211462104, + "loss": 1.4168, + "step": 992 + }, + { + "epoch": 0.310409502969678, + "grad_norm": 0.212890625, + "learning_rate": 0.00019709734711208303, + "loss": 1.5656, + "step": 993 + }, + { + "epoch": 0.31072210065645517, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019709146721606509, + "loss": 1.818, + "step": 994 + }, + { + "epoch": 0.3110346983432323, + "grad_norm": 0.205078125, + "learning_rate": 0.00019708558145851152, + "loss": 1.7158, + "step": 995 + }, + { + "epoch": 0.3113472960300094, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001970796898397777, + "loss": 1.6944, + "step": 996 + }, + { + "epoch": 0.3116598937167865, + "grad_norm": 0.2109375, + "learning_rate": 0.0001970737923602193, + "loss": 1.7961, + "step": 997 + }, + { + "epoch": 0.3119724914035636, + "grad_norm": 0.201171875, + "learning_rate": 0.00019706788902019233, + "loss": 1.8871, + "step": 998 + }, + { + "epoch": 0.31228508909034075, + "grad_norm": 0.205078125, + "learning_rate": 0.00019706197982005322, + "loss": 1.8513, + "step": 999 + }, + { + "epoch": 0.31259768677711786, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001970560647601587, + "loss": 1.6529, + "step": 1000 + }, + { + "epoch": 0.31291028446389496, + "grad_norm": 0.21875, + "learning_rate": 0.0001970501438408659, + "loss": 1.7564, + "step": 1001 + }, + { + "epoch": 0.31322288215067207, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001970442170625322, + "loss": 1.5718, + "step": 1002 + }, + { + "epoch": 0.3135354798374492, + "grad_norm": 0.201171875, + "learning_rate": 0.00019703828442551547, + "loss": 1.9791, + "step": 1003 + }, + { + "epoch": 0.31384807752422633, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019703234593017386, + "loss": 1.5583, + "step": 1004 + }, + { + "epoch": 0.31416067521100344, + "grad_norm": 0.19921875, + "learning_rate": 0.00019702640157686586, + "loss": 1.8005, + "step": 1005 + }, + { + "epoch": 0.31447327289778054, + "grad_norm": 0.216796875, + "learning_rate": 0.00019702045136595032, + "loss": 2.0622, + "step": 1006 + }, + { + "epoch": 0.31478587058455765, + "grad_norm": 0.19921875, + "learning_rate": 0.00019701449529778656, + "loss": 1.6313, + "step": 1007 + }, + { + "epoch": 0.3150984682713348, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019700853337273406, + "loss": 1.7088, + "step": 1008 + }, + { + "epoch": 0.3154110659581119, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001970025655911528, + "loss": 1.7942, + "step": 1009 + }, + { + "epoch": 0.315723663644889, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019699659195340303, + "loss": 1.8139, + "step": 1010 + }, + { + "epoch": 0.3160362613316661, + "grad_norm": 0.212890625, + "learning_rate": 0.0001969906124598454, + "loss": 1.6704, + "step": 1011 + }, + { + "epoch": 0.3163488590184433, + "grad_norm": 0.2109375, + "learning_rate": 0.00019698462711084091, + "loss": 1.9731, + "step": 1012 + }, + { + "epoch": 
0.3166614567052204, + "grad_norm": 0.2109375, + "learning_rate": 0.00019697863590675086, + "loss": 1.6923, + "step": 1013 + }, + { + "epoch": 0.3169740543919975, + "grad_norm": 0.21484375, + "learning_rate": 0.00019697263884793702, + "loss": 1.8974, + "step": 1014 + }, + { + "epoch": 0.3172866520787746, + "grad_norm": 0.212890625, + "learning_rate": 0.0001969666359347614, + "loss": 2.0298, + "step": 1015 + }, + { + "epoch": 0.31759924976555176, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019696062716758638, + "loss": 1.6155, + "step": 1016 + }, + { + "epoch": 0.31791184745232887, + "grad_norm": 0.212890625, + "learning_rate": 0.00019695461254677475, + "loss": 1.6622, + "step": 1017 + }, + { + "epoch": 0.31822444513910597, + "grad_norm": 0.201171875, + "learning_rate": 0.00019694859207268958, + "loss": 2.0245, + "step": 1018 + }, + { + "epoch": 0.3185370428258831, + "grad_norm": 0.205078125, + "learning_rate": 0.0001969425657456944, + "loss": 1.7654, + "step": 1019 + }, + { + "epoch": 0.3188496405126602, + "grad_norm": 0.203125, + "learning_rate": 0.00019693653356615297, + "loss": 1.6629, + "step": 1020 + }, + { + "epoch": 0.31916223819943734, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019693049553442952, + "loss": 1.7823, + "step": 1021 + }, + { + "epoch": 0.31947483588621445, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001969244516508885, + "loss": 1.5993, + "step": 1022 + }, + { + "epoch": 0.31978743357299155, + "grad_norm": 0.2109375, + "learning_rate": 0.0001969184019158948, + "loss": 1.7385, + "step": 1023 + }, + { + "epoch": 0.32010003125976866, + "grad_norm": 0.220703125, + "learning_rate": 0.00019691234632981372, + "loss": 2.0781, + "step": 1024 + }, + { + "epoch": 0.3204126289465458, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019690628489301077, + "loss": 1.6396, + "step": 1025 + }, + { + "epoch": 0.3207252266333229, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019690021760585192, + "loss": 1.7066, + "step": 1026 + }, + { + "epoch": 0.32103782432010003, + "grad_norm": 0.216796875, + "learning_rate": 0.00019689414446870344, + "loss": 1.6741, + "step": 1027 + }, + { + "epoch": 0.32135042200687713, + "grad_norm": 0.2109375, + "learning_rate": 0.000196888065481932, + "loss": 1.8628, + "step": 1028 + }, + { + "epoch": 0.32166301969365424, + "grad_norm": 0.234375, + "learning_rate": 0.00019688198064590458, + "loss": 1.8129, + "step": 1029 + }, + { + "epoch": 0.3219756173804314, + "grad_norm": 0.203125, + "learning_rate": 0.00019687588996098853, + "loss": 1.9068, + "step": 1030 + }, + { + "epoch": 0.3222882150672085, + "grad_norm": 0.2109375, + "learning_rate": 0.00019686979342755154, + "loss": 1.8664, + "step": 1031 + }, + { + "epoch": 0.3226008127539856, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001968636910459617, + "loss": 1.7239, + "step": 1032 + }, + { + "epoch": 0.3229134104407627, + "grad_norm": 0.201171875, + "learning_rate": 0.00019685758281658738, + "loss": 1.9294, + "step": 1033 + }, + { + "epoch": 0.3232260081275399, + "grad_norm": 0.20703125, + "learning_rate": 0.00019685146873979736, + "loss": 1.7469, + "step": 1034 + }, + { + "epoch": 0.323538605814317, + "grad_norm": 0.208984375, + "learning_rate": 0.00019684534881596078, + "loss": 1.8425, + "step": 1035 + }, + { + "epoch": 0.3238512035010941, + "grad_norm": 0.208984375, + "learning_rate": 0.00019683922304544705, + "loss": 1.5658, + "step": 1036 + }, + { + "epoch": 0.3241638011878712, + "grad_norm": 0.20703125, + "learning_rate": 0.000196833091428626, + "loss": 1.7025, + 
"step": 1037 + }, + { + "epoch": 0.32447639887464835, + "grad_norm": 0.20703125, + "learning_rate": 0.00019682695396586785, + "loss": 1.7166, + "step": 1038 + }, + { + "epoch": 0.32478899656142546, + "grad_norm": 0.220703125, + "learning_rate": 0.00019682081065754313, + "loss": 1.8159, + "step": 1039 + }, + { + "epoch": 0.32510159424820256, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019681466150402266, + "loss": 1.7957, + "step": 1040 + }, + { + "epoch": 0.32541419193497967, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001968085065056777, + "loss": 1.6375, + "step": 1041 + }, + { + "epoch": 0.3257267896217568, + "grad_norm": 0.21484375, + "learning_rate": 0.00019680234566287985, + "loss": 2.1855, + "step": 1042 + }, + { + "epoch": 0.32603938730853393, + "grad_norm": 0.20703125, + "learning_rate": 0.00019679617897600102, + "loss": 1.8348, + "step": 1043 + }, + { + "epoch": 0.32635198499531104, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019679000644541356, + "loss": 1.6444, + "step": 1044 + }, + { + "epoch": 0.32666458268208814, + "grad_norm": 0.205078125, + "learning_rate": 0.00019678382807149003, + "loss": 1.8918, + "step": 1045 + }, + { + "epoch": 0.32697718036886525, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019677764385460348, + "loss": 1.6544, + "step": 1046 + }, + { + "epoch": 0.3272897780556424, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019677145379512723, + "loss": 1.8734, + "step": 1047 + }, + { + "epoch": 0.3276023757424195, + "grad_norm": 0.208984375, + "learning_rate": 0.00019676525789343502, + "loss": 1.8792, + "step": 1048 + }, + { + "epoch": 0.3279149734291966, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019675905614990085, + "loss": 1.8914, + "step": 1049 + }, + { + "epoch": 0.3282275711159737, + "grad_norm": 0.20703125, + "learning_rate": 0.0001967528485648992, + "loss": 1.6186, + "step": 1050 + }, + { + "epoch": 0.3285401688027509, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019674663513880475, + "loss": 1.7937, + "step": 1051 + }, + { + "epoch": 0.328852766489528, + "grad_norm": 0.203125, + "learning_rate": 0.00019674041587199268, + "loss": 1.7155, + "step": 1052 + }, + { + "epoch": 0.3291653641763051, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001967341907648384, + "loss": 1.8787, + "step": 1053 + }, + { + "epoch": 0.3294779618630822, + "grad_norm": 0.220703125, + "learning_rate": 0.00019672795981771777, + "loss": 1.6195, + "step": 1054 + }, + { + "epoch": 0.3297905595498593, + "grad_norm": 0.203125, + "learning_rate": 0.00019672172303100696, + "loss": 1.9987, + "step": 1055 + }, + { + "epoch": 0.33010315723663647, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019671548040508244, + "loss": 1.6107, + "step": 1056 + }, + { + "epoch": 0.3304157549234136, + "grad_norm": 0.20703125, + "learning_rate": 0.00019670923194032116, + "loss": 1.6394, + "step": 1057 + }, + { + "epoch": 0.3307283526101907, + "grad_norm": 0.19921875, + "learning_rate": 0.00019670297763710028, + "loss": 1.7142, + "step": 1058 + }, + { + "epoch": 0.3310409502969678, + "grad_norm": 0.205078125, + "learning_rate": 0.00019669671749579742, + "loss": 1.8344, + "step": 1059 + }, + { + "epoch": 0.33135354798374494, + "grad_norm": 0.212890625, + "learning_rate": 0.0001966904515167905, + "loss": 1.933, + "step": 1060 + }, + { + "epoch": 0.33166614567052205, + "grad_norm": 0.2041015625, + "learning_rate": 0.0001966841797004578, + "loss": 1.763, + "step": 1061 + }, + { + "epoch": 0.33197874335729916, + "grad_norm": 0.2041015625, + 
"learning_rate": 0.000196677902047178, + "loss": 1.8741, + "step": 1062 + }, + { + "epoch": 0.33229134104407626, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019667161855733002, + "loss": 1.8624, + "step": 1063 + }, + { + "epoch": 0.33260393873085337, + "grad_norm": 0.216796875, + "learning_rate": 0.00019666532923129327, + "loss": 1.899, + "step": 1064 + }, + { + "epoch": 0.3329165364176305, + "grad_norm": 0.30078125, + "learning_rate": 0.00019665903406944737, + "loss": 2.3084, + "step": 1065 + }, + { + "epoch": 0.33322913410440763, + "grad_norm": 0.197265625, + "learning_rate": 0.00019665273307217245, + "loss": 1.6737, + "step": 1066 + }, + { + "epoch": 0.33354173179118474, + "grad_norm": 0.216796875, + "learning_rate": 0.00019664642623984886, + "loss": 1.6899, + "step": 1067 + }, + { + "epoch": 0.33385432947796184, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019664011357285735, + "loss": 1.8702, + "step": 1068 + }, + { + "epoch": 0.334166927164739, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019663379507157903, + "loss": 2.0766, + "step": 1069 + }, + { + "epoch": 0.3344795248515161, + "grad_norm": 0.1953125, + "learning_rate": 0.00019662747073639537, + "loss": 1.9336, + "step": 1070 + }, + { + "epoch": 0.3347921225382932, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019662114056768815, + "loss": 1.8872, + "step": 1071 + }, + { + "epoch": 0.3351047202250703, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019661480456583958, + "loss": 1.7719, + "step": 1072 + }, + { + "epoch": 0.3354173179118475, + "grad_norm": 0.220703125, + "learning_rate": 0.00019660846273123213, + "loss": 1.695, + "step": 1073 + }, + { + "epoch": 0.3357299155986246, + "grad_norm": 0.208984375, + "learning_rate": 0.00019660211506424867, + "loss": 1.8269, + "step": 1074 + }, + { + "epoch": 0.3360425132854017, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001965957615652724, + "loss": 1.8746, + "step": 1075 + }, + { + "epoch": 0.3363551109721788, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019658940223468693, + "loss": 1.5041, + "step": 1076 + }, + { + "epoch": 0.3366677086589559, + "grad_norm": 0.22265625, + "learning_rate": 0.00019658303707287617, + "loss": 1.8079, + "step": 1077 + }, + { + "epoch": 0.33698030634573306, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019657666608022438, + "loss": 1.7644, + "step": 1078 + }, + { + "epoch": 0.33729290403251017, + "grad_norm": 0.212890625, + "learning_rate": 0.00019657028925711617, + "loss": 1.759, + "step": 1079 + }, + { + "epoch": 0.33760550171928727, + "grad_norm": 0.220703125, + "learning_rate": 0.00019656390660393659, + "loss": 1.9192, + "step": 1080 + }, + { + "epoch": 0.3379180994060644, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019655751812107085, + "loss": 1.9153, + "step": 1081 + }, + { + "epoch": 0.33823069709284154, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019655112380890475, + "loss": 1.688, + "step": 1082 + }, + { + "epoch": 0.33854329477961864, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019654472366782425, + "loss": 1.907, + "step": 1083 + }, + { + "epoch": 0.33885589246639575, + "grad_norm": 0.212890625, + "learning_rate": 0.00019653831769821575, + "loss": 1.9453, + "step": 1084 + }, + { + "epoch": 0.33916849015317285, + "grad_norm": 0.2099609375, + "learning_rate": 0.000196531905900466, + "loss": 1.6311, + "step": 1085 + }, + { + "epoch": 0.33948108783994996, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019652548827496207, + "loss": 1.9493, + "step": 1086 + }, + { + 
"epoch": 0.3397936855267271, + "grad_norm": 0.208984375, + "learning_rate": 0.0001965190648220914, + "loss": 1.8175, + "step": 1087 + }, + { + "epoch": 0.3401062832135042, + "grad_norm": 0.19921875, + "learning_rate": 0.0001965126355422418, + "loss": 1.8018, + "step": 1088 + }, + { + "epoch": 0.34041888090028133, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001965062004358014, + "loss": 1.6674, + "step": 1089 + }, + { + "epoch": 0.34073147858705843, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001964997595031587, + "loss": 1.9538, + "step": 1090 + }, + { + "epoch": 0.3410440762738356, + "grad_norm": 0.203125, + "learning_rate": 0.00019649331274470256, + "loss": 1.8417, + "step": 1091 + }, + { + "epoch": 0.3413566739606127, + "grad_norm": 0.21484375, + "learning_rate": 0.00019648686016082216, + "loss": 2.0019, + "step": 1092 + }, + { + "epoch": 0.3416692716473898, + "grad_norm": 0.21484375, + "learning_rate": 0.00019648040175190707, + "loss": 1.7955, + "step": 1093 + }, + { + "epoch": 0.3419818693341669, + "grad_norm": 0.22265625, + "learning_rate": 0.00019647393751834718, + "loss": 1.6747, + "step": 1094 + }, + { + "epoch": 0.34229446702094407, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019646746746053274, + "loss": 1.7818, + "step": 1095 + }, + { + "epoch": 0.3426070647077212, + "grad_norm": 0.20703125, + "learning_rate": 0.00019646099157885437, + "loss": 1.7983, + "step": 1096 + }, + { + "epoch": 0.3429196623944983, + "grad_norm": 0.2265625, + "learning_rate": 0.00019645450987370298, + "loss": 1.677, + "step": 1097 + }, + { + "epoch": 0.3432322600812754, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019644802234546993, + "loss": 1.9241, + "step": 1098 + }, + { + "epoch": 0.3435448577680525, + "grad_norm": 0.212890625, + "learning_rate": 0.0001964415289945469, + "loss": 1.9008, + "step": 1099 + }, + { + "epoch": 0.34385745545482965, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019643502982132581, + "loss": 1.6438, + "step": 1100 + }, + { + "epoch": 0.34417005314160676, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001964285248261991, + "loss": 1.7665, + "step": 1101 + }, + { + "epoch": 0.34448265082838386, + "grad_norm": 0.2109375, + "learning_rate": 0.0001964220140095595, + "loss": 1.7259, + "step": 1102 + }, + { + "epoch": 0.34479524851516097, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019641549737180002, + "loss": 1.7119, + "step": 1103 + }, + { + "epoch": 0.34510784620193813, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019640897491331408, + "loss": 1.6551, + "step": 1104 + }, + { + "epoch": 0.34542044388871523, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001964024466344955, + "loss": 1.9882, + "step": 1105 + }, + { + "epoch": 0.34573304157549234, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019639591253573836, + "loss": 1.7573, + "step": 1106 + }, + { + "epoch": 0.34604563926226944, + "grad_norm": 0.208984375, + "learning_rate": 0.00019638937261743714, + "loss": 1.6814, + "step": 1107 + }, + { + "epoch": 0.34635823694904655, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019638282687998667, + "loss": 1.943, + "step": 1108 + }, + { + "epoch": 0.3466708346358237, + "grad_norm": 0.220703125, + "learning_rate": 0.00019637627532378212, + "loss": 1.6896, + "step": 1109 + }, + { + "epoch": 0.3469834323226008, + "grad_norm": 0.2197265625, + "learning_rate": 0.000196369717949219, + "loss": 1.8984, + "step": 1110 + }, + { + "epoch": 0.3472960300093779, + "grad_norm": 0.201171875, + "learning_rate": 
0.00019636315475669324, + "loss": 1.4845, + "step": 1111 + }, + { + "epoch": 0.347608627696155, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019635658574660098, + "loss": 1.7234, + "step": 1112 + }, + { + "epoch": 0.3479212253829322, + "grad_norm": 0.201171875, + "learning_rate": 0.0001963500109193389, + "loss": 1.5583, + "step": 1113 + }, + { + "epoch": 0.3482338230697093, + "grad_norm": 0.220703125, + "learning_rate": 0.00019634343027530383, + "loss": 1.8789, + "step": 1114 + }, + { + "epoch": 0.3485464207564864, + "grad_norm": 0.21484375, + "learning_rate": 0.00019633684381489315, + "loss": 2.0262, + "step": 1115 + }, + { + "epoch": 0.3488590184432635, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019633025153850442, + "loss": 1.7877, + "step": 1116 + }, + { + "epoch": 0.34917161613004066, + "grad_norm": 0.216796875, + "learning_rate": 0.00019632365344653563, + "loss": 1.7381, + "step": 1117 + }, + { + "epoch": 0.34948421381681777, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019631704953938518, + "loss": 1.7758, + "step": 1118 + }, + { + "epoch": 0.3497968115035949, + "grad_norm": 0.212890625, + "learning_rate": 0.0001963104398174517, + "loss": 1.8063, + "step": 1119 + }, + { + "epoch": 0.350109409190372, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019630382428113417, + "loss": 1.8691, + "step": 1120 + }, + { + "epoch": 0.3504220068771491, + "grad_norm": 0.203125, + "learning_rate": 0.00019629720293083214, + "loss": 1.7844, + "step": 1121 + }, + { + "epoch": 0.35073460456392624, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019629057576694522, + "loss": 1.6097, + "step": 1122 + }, + { + "epoch": 0.35104720225070335, + "grad_norm": 0.21875, + "learning_rate": 0.00019628394278987355, + "loss": 1.9393, + "step": 1123 + }, + { + "epoch": 0.35135979993748045, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001962773040000175, + "loss": 1.7556, + "step": 1124 + }, + { + "epoch": 0.35167239762425756, + "grad_norm": 0.220703125, + "learning_rate": 0.000196270659397778, + "loss": 1.7145, + "step": 1125 + }, + { + "epoch": 0.3519849953110347, + "grad_norm": 0.220703125, + "learning_rate": 0.0001962640089835561, + "loss": 1.6505, + "step": 1126 + }, + { + "epoch": 0.3522975929978118, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019625735275775327, + "loss": 1.6953, + "step": 1127 + }, + { + "epoch": 0.35261019068458893, + "grad_norm": 0.224609375, + "learning_rate": 0.00019625069072077138, + "loss": 1.7897, + "step": 1128 + }, + { + "epoch": 0.35292278837136604, + "grad_norm": 0.2109375, + "learning_rate": 0.0001962440228730127, + "loss": 1.8916, + "step": 1129 + }, + { + "epoch": 0.3532353860581432, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019623734921487965, + "loss": 1.5444, + "step": 1130 + }, + { + "epoch": 0.3535479837449203, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019623066974677525, + "loss": 1.6391, + "step": 1131 + }, + { + "epoch": 0.3538605814316974, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019622398446910263, + "loss": 1.6171, + "step": 1132 + }, + { + "epoch": 0.3541731791184745, + "grad_norm": 0.216796875, + "learning_rate": 0.0001962172933822655, + "loss": 1.6352, + "step": 1133 + }, + { + "epoch": 0.3544857768052516, + "grad_norm": 0.220703125, + "learning_rate": 0.00019621059648666772, + "loss": 1.8147, + "step": 1134 + }, + { + "epoch": 0.3547983744920288, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019620389378271366, + "loss": 1.7773, + "step": 1135 + }, + { + "epoch": 0.3551109721788059, + 
"grad_norm": 0.212890625, + "learning_rate": 0.0001961971852708079, + "loss": 1.7441, + "step": 1136 + }, + { + "epoch": 0.355423569865583, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019619047095135553, + "loss": 1.9931, + "step": 1137 + }, + { + "epoch": 0.3557361675523601, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019618375082476182, + "loss": 1.6723, + "step": 1138 + }, + { + "epoch": 0.35604876523913725, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001961770248914325, + "loss": 1.8312, + "step": 1139 + }, + { + "epoch": 0.35636136292591436, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019617029315177365, + "loss": 2.0553, + "step": 1140 + }, + { + "epoch": 0.35667396061269147, + "grad_norm": 0.20703125, + "learning_rate": 0.00019616355560619163, + "loss": 1.6513, + "step": 1141 + }, + { + "epoch": 0.35698655829946857, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019615681225509325, + "loss": 1.8244, + "step": 1142 + }, + { + "epoch": 0.3572991559862457, + "grad_norm": 0.20703125, + "learning_rate": 0.00019615006309888552, + "loss": 1.9322, + "step": 1143 + }, + { + "epoch": 0.35761175367302284, + "grad_norm": 0.2138671875, + "learning_rate": 0.000196143308137976, + "loss": 1.7572, + "step": 1144 + }, + { + "epoch": 0.35792435135979994, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019613654737277245, + "loss": 1.5536, + "step": 1145 + }, + { + "epoch": 0.35823694904657705, + "grad_norm": 0.2197265625, + "learning_rate": 0.000196129780803683, + "loss": 1.9036, + "step": 1146 + }, + { + "epoch": 0.35854954673335415, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019612300843111622, + "loss": 2.1856, + "step": 1147 + }, + { + "epoch": 0.3588621444201313, + "grad_norm": 0.212890625, + "learning_rate": 0.0001961162302554809, + "loss": 1.6396, + "step": 1148 + }, + { + "epoch": 0.3591747421069084, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019610944627718627, + "loss": 1.8837, + "step": 1149 + }, + { + "epoch": 0.3594873397936855, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019610265649664193, + "loss": 1.7418, + "step": 1150 + }, + { + "epoch": 0.35979993748046263, + "grad_norm": 0.2109375, + "learning_rate": 0.00019609586091425774, + "loss": 1.8848, + "step": 1151 + }, + { + "epoch": 0.3601125351672398, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019608905953044396, + "loss": 1.4857, + "step": 1152 + }, + { + "epoch": 0.3604251328540169, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019608225234561121, + "loss": 1.6741, + "step": 1153 + }, + { + "epoch": 0.360737730540794, + "grad_norm": 0.19921875, + "learning_rate": 0.00019607543936017046, + "loss": 1.6363, + "step": 1154 + }, + { + "epoch": 0.3610503282275711, + "grad_norm": 0.2109375, + "learning_rate": 0.00019606862057453298, + "loss": 1.8323, + "step": 1155 + }, + { + "epoch": 0.3613629259143482, + "grad_norm": 0.21484375, + "learning_rate": 0.00019606179598911049, + "loss": 1.6778, + "step": 1156 + }, + { + "epoch": 0.36167552360112537, + "grad_norm": 0.208984375, + "learning_rate": 0.00019605496560431496, + "loss": 1.8691, + "step": 1157 + }, + { + "epoch": 0.3619881212879025, + "grad_norm": 0.2109375, + "learning_rate": 0.00019604812942055873, + "loss": 1.6175, + "step": 1158 + }, + { + "epoch": 0.3623007189746796, + "grad_norm": 0.212890625, + "learning_rate": 0.00019604128743825453, + "loss": 1.717, + "step": 1159 + }, + { + "epoch": 0.3626133166614567, + "grad_norm": 0.201171875, + "learning_rate": 0.00019603443965781543, + "loss": 1.773, + "step": 
1160 + }, + { + "epoch": 0.36292591434823385, + "grad_norm": 0.212890625, + "learning_rate": 0.00019602758607965484, + "loss": 1.8844, + "step": 1161 + }, + { + "epoch": 0.36323851203501095, + "grad_norm": 0.2109375, + "learning_rate": 0.00019602072670418647, + "loss": 1.9545, + "step": 1162 + }, + { + "epoch": 0.36355110972178806, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019601386153182451, + "loss": 1.523, + "step": 1163 + }, + { + "epoch": 0.36386370740856516, + "grad_norm": 0.224609375, + "learning_rate": 0.00019600699056298337, + "loss": 2.0468, + "step": 1164 + }, + { + "epoch": 0.36417630509534227, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019600011379807786, + "loss": 1.9032, + "step": 1165 + }, + { + "epoch": 0.36448890278211943, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019599323123752315, + "loss": 1.3631, + "step": 1166 + }, + { + "epoch": 0.36480150046889653, + "grad_norm": 0.21875, + "learning_rate": 0.00019598634288173474, + "loss": 1.6805, + "step": 1167 + }, + { + "epoch": 0.36511409815567364, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019597944873112852, + "loss": 1.4813, + "step": 1168 + }, + { + "epoch": 0.36542669584245074, + "grad_norm": 0.216796875, + "learning_rate": 0.00019597254878612065, + "loss": 1.7945, + "step": 1169 + }, + { + "epoch": 0.3657392935292279, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001959656430471277, + "loss": 1.5851, + "step": 1170 + }, + { + "epoch": 0.366051891216005, + "grad_norm": 0.21875, + "learning_rate": 0.0001959587315145666, + "loss": 1.8493, + "step": 1171 + }, + { + "epoch": 0.3663644889027821, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001959518141888546, + "loss": 1.7852, + "step": 1172 + }, + { + "epoch": 0.3666770865895592, + "grad_norm": 0.22265625, + "learning_rate": 0.00019594489107040928, + "loss": 1.9668, + "step": 1173 + }, + { + "epoch": 0.3669896842763364, + "grad_norm": 0.216796875, + "learning_rate": 0.00019593796215964867, + "loss": 1.656, + "step": 1174 + }, + { + "epoch": 0.3673022819631135, + "grad_norm": 0.20703125, + "learning_rate": 0.000195931027456991, + "loss": 1.5947, + "step": 1175 + }, + { + "epoch": 0.3676148796498906, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019592408696285496, + "loss": 1.7685, + "step": 1176 + }, + { + "epoch": 0.3679274773366677, + "grad_norm": 0.220703125, + "learning_rate": 0.00019591714067765953, + "loss": 1.6027, + "step": 1177 + }, + { + "epoch": 0.3682400750234448, + "grad_norm": 0.205078125, + "learning_rate": 0.0001959101886018241, + "loss": 2.2013, + "step": 1178 + }, + { + "epoch": 0.36855267271022196, + "grad_norm": 0.208984375, + "learning_rate": 0.0001959032307357684, + "loss": 1.6995, + "step": 1179 + }, + { + "epoch": 0.36886527039699907, + "grad_norm": 0.20703125, + "learning_rate": 0.00019589626707991242, + "loss": 1.7104, + "step": 1180 + }, + { + "epoch": 0.3691778680837762, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019588929763467657, + "loss": 1.6798, + "step": 1181 + }, + { + "epoch": 0.3694904657705533, + "grad_norm": 0.20703125, + "learning_rate": 0.00019588232240048167, + "loss": 1.5464, + "step": 1182 + }, + { + "epoch": 0.36980306345733044, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001958753413777488, + "loss": 1.7789, + "step": 1183 + }, + { + "epoch": 0.37011566114410754, + "grad_norm": 0.2265625, + "learning_rate": 0.00019586835456689934, + "loss": 1.7634, + "step": 1184 + }, + { + "epoch": 0.37042825883088465, + "grad_norm": 0.2109375, + "learning_rate": 
0.0001958613619683552, + "loss": 1.9015, + "step": 1185 + }, + { + "epoch": 0.37074085651766175, + "grad_norm": 0.318359375, + "learning_rate": 0.00019585436358253845, + "loss": 2.3964, + "step": 1186 + }, + { + "epoch": 0.37105345420443886, + "grad_norm": 0.216796875, + "learning_rate": 0.00019584735940987163, + "loss": 1.7068, + "step": 1187 + }, + { + "epoch": 0.371366051891216, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019584034945077758, + "loss": 1.9431, + "step": 1188 + }, + { + "epoch": 0.3716786495779931, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001958333337056795, + "loss": 1.6602, + "step": 1189 + }, + { + "epoch": 0.37199124726477023, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019582631217500093, + "loss": 1.9655, + "step": 1190 + }, + { + "epoch": 0.37230384495154734, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001958192848591658, + "loss": 1.7755, + "step": 1191 + }, + { + "epoch": 0.3726164426383245, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019581225175859833, + "loss": 1.7425, + "step": 1192 + }, + { + "epoch": 0.3729290403251016, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019580521287372317, + "loss": 1.8308, + "step": 1193 + }, + { + "epoch": 0.3732416380118787, + "grad_norm": 0.2265625, + "learning_rate": 0.00019579816820496516, + "loss": 1.7996, + "step": 1194 + }, + { + "epoch": 0.3735542356986558, + "grad_norm": 0.224609375, + "learning_rate": 0.0001957911177527497, + "loss": 1.8265, + "step": 1195 + }, + { + "epoch": 0.373866833385433, + "grad_norm": 0.21484375, + "learning_rate": 0.00019578406151750236, + "loss": 1.5686, + "step": 1196 + }, + { + "epoch": 0.3741794310722101, + "grad_norm": 0.212890625, + "learning_rate": 0.0001957769994996492, + "loss": 1.7951, + "step": 1197 + }, + { + "epoch": 0.3744920287589872, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019576993169961653, + "loss": 1.7821, + "step": 1198 + }, + { + "epoch": 0.3748046264457643, + "grad_norm": 0.2158203125, + "learning_rate": 0.000195762858117831, + "loss": 1.7286, + "step": 1199 + }, + { + "epoch": 0.3751172241325414, + "grad_norm": 0.212890625, + "learning_rate": 0.00019575577875471974, + "loss": 1.707, + "step": 1200 + }, + { + "epoch": 0.37542982181931855, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019574869361071006, + "loss": 1.9656, + "step": 1201 + }, + { + "epoch": 0.37574241950609566, + "grad_norm": 0.2265625, + "learning_rate": 0.00019574160268622976, + "loss": 1.7242, + "step": 1202 + }, + { + "epoch": 0.37605501719287276, + "grad_norm": 0.21484375, + "learning_rate": 0.00019573450598170687, + "loss": 1.7001, + "step": 1203 + }, + { + "epoch": 0.37636761487964987, + "grad_norm": 0.21875, + "learning_rate": 0.00019572740349756992, + "loss": 1.8952, + "step": 1204 + }, + { + "epoch": 0.37668021256642703, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019572029523424756, + "loss": 1.8052, + "step": 1205 + }, + { + "epoch": 0.37699281025320414, + "grad_norm": 0.21875, + "learning_rate": 0.00019571318119216904, + "loss": 1.8727, + "step": 1206 + }, + { + "epoch": 0.37730540793998124, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001957060613717638, + "loss": 1.6054, + "step": 1207 + }, + { + "epoch": 0.37761800562675835, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019569893577346168, + "loss": 1.8537, + "step": 1208 + }, + { + "epoch": 0.3779306033135355, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019569180439769283, + "loss": 1.6096, + "step": 1209 + }, + { + "epoch": 0.3782432010003126, + 
"grad_norm": 0.2275390625, + "learning_rate": 0.00019568466724488782, + "loss": 1.9668, + "step": 1210 + }, + { + "epoch": 0.3785557986870897, + "grad_norm": 0.20703125, + "learning_rate": 0.00019567752431547754, + "loss": 1.6992, + "step": 1211 + }, + { + "epoch": 0.3788683963738668, + "grad_norm": 0.2109375, + "learning_rate": 0.00019567037560989315, + "loss": 1.6169, + "step": 1212 + }, + { + "epoch": 0.37918099406064393, + "grad_norm": 0.21875, + "learning_rate": 0.00019566322112856633, + "loss": 1.7126, + "step": 1213 + }, + { + "epoch": 0.3794935917474211, + "grad_norm": 0.203125, + "learning_rate": 0.0001956560608719289, + "loss": 1.6279, + "step": 1214 + }, + { + "epoch": 0.3798061894341982, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001956488948404132, + "loss": 2.0578, + "step": 1215 + }, + { + "epoch": 0.3801187871209753, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019564172303445182, + "loss": 1.7761, + "step": 1216 + }, + { + "epoch": 0.3804313848077524, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019563454545447773, + "loss": 1.6644, + "step": 1217 + }, + { + "epoch": 0.38074398249452956, + "grad_norm": 0.2109375, + "learning_rate": 0.00019562736210092428, + "loss": 1.8542, + "step": 1218 + }, + { + "epoch": 0.38105658018130667, + "grad_norm": 0.208984375, + "learning_rate": 0.0001956201729742251, + "loss": 1.7917, + "step": 1219 + }, + { + "epoch": 0.3813691778680838, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019561297807481427, + "loss": 1.8474, + "step": 1220 + }, + { + "epoch": 0.3816817755548609, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001956057774031261, + "loss": 1.627, + "step": 1221 + }, + { + "epoch": 0.381994373241638, + "grad_norm": 0.2109375, + "learning_rate": 0.00019559857095959528, + "loss": 1.6842, + "step": 1222 + }, + { + "epoch": 0.38230697092841515, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019559135874465695, + "loss": 1.7735, + "step": 1223 + }, + { + "epoch": 0.38261956861519225, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019558414075874646, + "loss": 1.8281, + "step": 1224 + }, + { + "epoch": 0.38293216630196936, + "grad_norm": 0.21484375, + "learning_rate": 0.00019557691700229957, + "loss": 1.5633, + "step": 1225 + }, + { + "epoch": 0.38324476398874646, + "grad_norm": 0.212890625, + "learning_rate": 0.00019556968747575244, + "loss": 1.8649, + "step": 1226 + }, + { + "epoch": 0.3835573616755236, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019556245217954149, + "loss": 1.6938, + "step": 1227 + }, + { + "epoch": 0.38386995936230073, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001955552111141035, + "loss": 1.6866, + "step": 1228 + }, + { + "epoch": 0.38418255704907783, + "grad_norm": 0.232421875, + "learning_rate": 0.00019554796427987566, + "loss": 1.9343, + "step": 1229 + }, + { + "epoch": 0.38449515473585494, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019554071167729545, + "loss": 1.9785, + "step": 1230 + }, + { + "epoch": 0.3848077524226321, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019553345330680077, + "loss": 1.876, + "step": 1231 + }, + { + "epoch": 0.3851203501094092, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019552618916882973, + "loss": 1.671, + "step": 1232 + }, + { + "epoch": 0.3854329477961863, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019551891926382093, + "loss": 1.6575, + "step": 1233 + }, + { + "epoch": 0.3857455454829634, + "grad_norm": 0.216796875, + "learning_rate": 0.00019551164359221326, + "loss": 1.9775, + "step": 
1234 + }, + { + "epoch": 0.3860581431697405, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019550436215444594, + "loss": 1.7329, + "step": 1235 + }, + { + "epoch": 0.3863707408565177, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001954970749509586, + "loss": 1.6745, + "step": 1236 + }, + { + "epoch": 0.3866833385432948, + "grad_norm": 0.21484375, + "learning_rate": 0.00019548978198219113, + "loss": 1.7502, + "step": 1237 + }, + { + "epoch": 0.3869959362300719, + "grad_norm": 0.228515625, + "learning_rate": 0.00019548248324858386, + "loss": 1.6299, + "step": 1238 + }, + { + "epoch": 0.387308533916849, + "grad_norm": 0.21875, + "learning_rate": 0.00019547517875057738, + "loss": 1.6477, + "step": 1239 + }, + { + "epoch": 0.38762113160362616, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019546786848861268, + "loss": 1.8717, + "step": 1240 + }, + { + "epoch": 0.38793372929040326, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019546055246313113, + "loss": 1.5382, + "step": 1241 + }, + { + "epoch": 0.38824632697718037, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019545323067457439, + "loss": 2.0394, + "step": 1242 + }, + { + "epoch": 0.3885589246639575, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019544590312338444, + "loss": 1.8064, + "step": 1243 + }, + { + "epoch": 0.3888715223507346, + "grad_norm": 0.2265625, + "learning_rate": 0.00019543856981000371, + "loss": 1.6846, + "step": 1244 + }, + { + "epoch": 0.38918412003751174, + "grad_norm": 0.203125, + "learning_rate": 0.0001954312307348749, + "loss": 1.7834, + "step": 1245 + }, + { + "epoch": 0.38949671772428884, + "grad_norm": 0.21484375, + "learning_rate": 0.0001954238858984411, + "loss": 1.8043, + "step": 1246 + }, + { + "epoch": 0.38980931541106595, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019541653530114568, + "loss": 1.7905, + "step": 1247 + }, + { + "epoch": 0.39012191309784305, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019540917894343246, + "loss": 1.6521, + "step": 1248 + }, + { + "epoch": 0.3904345107846202, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019540181682574552, + "loss": 1.6881, + "step": 1249 + }, + { + "epoch": 0.3907471084713973, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001953944489485293, + "loss": 2.0565, + "step": 1250 + }, + { + "epoch": 0.3910597061581744, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019538707531222867, + "loss": 1.7884, + "step": 1251 + }, + { + "epoch": 0.39137230384495153, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019537969591728872, + "loss": 1.5153, + "step": 1252 + }, + { + "epoch": 0.3916849015317287, + "grad_norm": 0.220703125, + "learning_rate": 0.000195372310764155, + "loss": 1.8401, + "step": 1253 + }, + { + "epoch": 0.3919974992185058, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019536491985327334, + "loss": 1.5898, + "step": 1254 + }, + { + "epoch": 0.3923100969052829, + "grad_norm": 0.208984375, + "learning_rate": 0.00019535752318508998, + "loss": 1.8118, + "step": 1255 + }, + { + "epoch": 0.39262269459206, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019535012076005138, + "loss": 1.4033, + "step": 1256 + }, + { + "epoch": 0.3929352922788371, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019534271257860448, + "loss": 1.672, + "step": 1257 + }, + { + "epoch": 0.3932478899656143, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019533529864119658, + "loss": 1.752, + "step": 1258 + }, + { + "epoch": 0.3935604876523914, + "grad_norm": 0.2099609375, + "learning_rate": 
0.0001953278789482752, + "loss": 1.3813, + "step": 1259 + }, + { + "epoch": 0.3938730853391685, + "grad_norm": 0.2265625, + "learning_rate": 0.00019532045350028826, + "loss": 1.8827, + "step": 1260 + }, + { + "epoch": 0.3941856830259456, + "grad_norm": 0.224609375, + "learning_rate": 0.00019531302229768404, + "loss": 1.9363, + "step": 1261 + }, + { + "epoch": 0.39449828071272275, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019530558534091127, + "loss": 1.8975, + "step": 1262 + }, + { + "epoch": 0.39481087839949985, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019529814263041884, + "loss": 1.7931, + "step": 1263 + }, + { + "epoch": 0.39512347608627696, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001952906941666561, + "loss": 1.7258, + "step": 1264 + }, + { + "epoch": 0.39543607377305406, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001952832399500727, + "loss": 1.8547, + "step": 1265 + }, + { + "epoch": 0.3957486714598312, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019527577998111874, + "loss": 1.7344, + "step": 1266 + }, + { + "epoch": 0.39606126914660833, + "grad_norm": 0.2109375, + "learning_rate": 0.0001952683142602445, + "loss": 1.7313, + "step": 1267 + }, + { + "epoch": 0.39637386683338544, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019526084278790074, + "loss": 1.8261, + "step": 1268 + }, + { + "epoch": 0.39668646452016254, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019525336556453852, + "loss": 1.7306, + "step": 1269 + }, + { + "epoch": 0.39699906220693965, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001952458825906092, + "loss": 1.9536, + "step": 1270 + }, + { + "epoch": 0.3973116598937168, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019523839386656458, + "loss": 1.7486, + "step": 1271 + }, + { + "epoch": 0.3976242575804939, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019523089939285675, + "loss": 1.9232, + "step": 1272 + }, + { + "epoch": 0.397936855267271, + "grad_norm": 0.220703125, + "learning_rate": 0.0001952233991699382, + "loss": 1.5959, + "step": 1273 + }, + { + "epoch": 0.3982494529540481, + "grad_norm": 0.224609375, + "learning_rate": 0.00019521589319826168, + "loss": 1.9811, + "step": 1274 + }, + { + "epoch": 0.3985620506408253, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019520838147828035, + "loss": 1.6908, + "step": 1275 + }, + { + "epoch": 0.3988746483276024, + "grad_norm": 0.208984375, + "learning_rate": 0.00019520086401044772, + "loss": 1.7011, + "step": 1276 + }, + { + "epoch": 0.3991872460143795, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001951933407952176, + "loss": 1.6478, + "step": 1277 + }, + { + "epoch": 0.3994998437011566, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001951858118330442, + "loss": 1.5169, + "step": 1278 + }, + { + "epoch": 0.3998124413879337, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019517827712438207, + "loss": 1.7061, + "step": 1279 + }, + { + "epoch": 0.40012503907471086, + "grad_norm": 0.212890625, + "learning_rate": 0.00019517073666968604, + "loss": 1.7499, + "step": 1280 + }, + { + "epoch": 0.40043763676148797, + "grad_norm": 0.212890625, + "learning_rate": 0.00019516319046941134, + "loss": 2.132, + "step": 1281 + }, + { + "epoch": 0.4007502344482651, + "grad_norm": 0.20703125, + "learning_rate": 0.00019515563852401358, + "loss": 1.56, + "step": 1282 + }, + { + "epoch": 0.4010628321350422, + "grad_norm": 0.216796875, + "learning_rate": 0.00019514808083394866, + "loss": 1.86, + "step": 1283 + }, + { + "epoch": 
0.40137542982181934, + "grad_norm": 0.22265625, + "learning_rate": 0.00019514051739967286, + "loss": 1.6877, + "step": 1284 + }, + { + "epoch": 0.40168802750859645, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019513294822164277, + "loss": 1.5612, + "step": 1285 + }, + { + "epoch": 0.40200062519537355, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019512537330031537, + "loss": 1.7812, + "step": 1286 + }, + { + "epoch": 0.40231322288215066, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019511779263614798, + "loss": 1.5228, + "step": 1287 + }, + { + "epoch": 0.4026258205689278, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019511020622959823, + "loss": 1.4276, + "step": 1288 + }, + { + "epoch": 0.4029384182557049, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019510261408112414, + "loss": 1.8561, + "step": 1289 + }, + { + "epoch": 0.403251015942482, + "grad_norm": 0.21875, + "learning_rate": 0.00019509501619118403, + "loss": 1.8674, + "step": 1290 + }, + { + "epoch": 0.40356361362925913, + "grad_norm": 0.20703125, + "learning_rate": 0.0001950874125602366, + "loss": 1.8583, + "step": 1291 + }, + { + "epoch": 0.40387621131603624, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019507980318874096, + "loss": 1.686, + "step": 1292 + }, + { + "epoch": 0.4041888090028134, + "grad_norm": 0.21484375, + "learning_rate": 0.00019507218807715638, + "loss": 1.7897, + "step": 1293 + }, + { + "epoch": 0.4045014066895905, + "grad_norm": 0.228515625, + "learning_rate": 0.00019506456722594265, + "loss": 1.7626, + "step": 1294 + }, + { + "epoch": 0.4048140043763676, + "grad_norm": 0.212890625, + "learning_rate": 0.0001950569406355599, + "loss": 1.9098, + "step": 1295 + }, + { + "epoch": 0.4051266020631447, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001950493083064685, + "loss": 1.5848, + "step": 1296 + }, + { + "epoch": 0.4054391997499219, + "grad_norm": 0.220703125, + "learning_rate": 0.00019504167023912922, + "loss": 1.6362, + "step": 1297 + }, + { + "epoch": 0.405751797436699, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001950340264340032, + "loss": 1.9604, + "step": 1298 + }, + { + "epoch": 0.4060643951234761, + "grad_norm": 0.306640625, + "learning_rate": 0.0001950263768915519, + "loss": 2.5325, + "step": 1299 + }, + { + "epoch": 0.4063769928102532, + "grad_norm": 0.21484375, + "learning_rate": 0.00019501872161223712, + "loss": 1.9979, + "step": 1300 + }, + { + "epoch": 0.4066895904970303, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019501106059652108, + "loss": 1.714, + "step": 1301 + }, + { + "epoch": 0.40700218818380746, + "grad_norm": 0.220703125, + "learning_rate": 0.0001950033938448662, + "loss": 1.7827, + "step": 1302 + }, + { + "epoch": 0.40731478587058456, + "grad_norm": 0.21484375, + "learning_rate": 0.00019499572135773537, + "loss": 1.6062, + "step": 1303 + }, + { + "epoch": 0.40762738355736167, + "grad_norm": 0.21484375, + "learning_rate": 0.0001949880431355918, + "loss": 1.6599, + "step": 1304 + }, + { + "epoch": 0.40793998124413877, + "grad_norm": 0.2197265625, + "learning_rate": 0.000194980359178899, + "loss": 1.5345, + "step": 1305 + }, + { + "epoch": 0.40825257893091593, + "grad_norm": 0.220703125, + "learning_rate": 0.0001949726694881209, + "loss": 1.8149, + "step": 1306 + }, + { + "epoch": 0.40856517661769304, + "grad_norm": 0.220703125, + "learning_rate": 0.00019496497406372174, + "loss": 1.6207, + "step": 1307 + }, + { + "epoch": 0.40887777430447014, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019495727290616606, + 
"loss": 1.7058, + "step": 1308 + }, + { + "epoch": 0.40919037199124725, + "grad_norm": 0.216796875, + "learning_rate": 0.0001949495660159188, + "loss": 1.5045, + "step": 1309 + }, + { + "epoch": 0.4095029696780244, + "grad_norm": 0.21875, + "learning_rate": 0.00019494185339344523, + "loss": 1.8221, + "step": 1310 + }, + { + "epoch": 0.4098155673648015, + "grad_norm": 0.224609375, + "learning_rate": 0.000194934135039211, + "loss": 1.4478, + "step": 1311 + }, + { + "epoch": 0.4101281650515786, + "grad_norm": 0.228515625, + "learning_rate": 0.0001949264109536821, + "loss": 1.4922, + "step": 1312 + }, + { + "epoch": 0.4104407627383557, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019491868113732474, + "loss": 1.8462, + "step": 1313 + }, + { + "epoch": 0.41075336042513283, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001949109455906057, + "loss": 1.831, + "step": 1314 + }, + { + "epoch": 0.41106595811191, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001949032043139919, + "loss": 1.5742, + "step": 1315 + }, + { + "epoch": 0.4113785557986871, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001948954573079507, + "loss": 1.7099, + "step": 1316 + }, + { + "epoch": 0.4116911534854642, + "grad_norm": 0.21875, + "learning_rate": 0.00019488770457294985, + "loss": 1.8771, + "step": 1317 + }, + { + "epoch": 0.4120037511722413, + "grad_norm": 0.21875, + "learning_rate": 0.00019487994610945734, + "loss": 1.9056, + "step": 1318 + }, + { + "epoch": 0.41231634885901847, + "grad_norm": 0.20703125, + "learning_rate": 0.00019487218191794158, + "loss": 1.7384, + "step": 1319 + }, + { + "epoch": 0.41262894654579557, + "grad_norm": 0.212890625, + "learning_rate": 0.00019486441199887132, + "loss": 1.9079, + "step": 1320 + }, + { + "epoch": 0.4129415442325727, + "grad_norm": 0.224609375, + "learning_rate": 0.00019485663635271562, + "loss": 1.8313, + "step": 1321 + }, + { + "epoch": 0.4132541419193498, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019484885497994387, + "loss": 1.642, + "step": 1322 + }, + { + "epoch": 0.4135667396061269, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019484106788102593, + "loss": 1.7165, + "step": 1323 + }, + { + "epoch": 0.41387933729290405, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001948332750564318, + "loss": 1.6474, + "step": 1324 + }, + { + "epoch": 0.41419193497968115, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019482547650663206, + "loss": 1.5541, + "step": 1325 + }, + { + "epoch": 0.41450453266645826, + "grad_norm": 0.20703125, + "learning_rate": 0.00019481767223209745, + "loss": 2.0118, + "step": 1326 + }, + { + "epoch": 0.41481713035323536, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019480986223329913, + "loss": 1.8306, + "step": 1327 + }, + { + "epoch": 0.4151297280400125, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019480204651070864, + "loss": 1.6828, + "step": 1328 + }, + { + "epoch": 0.41544232572678963, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019479422506479775, + "loss": 1.6071, + "step": 1329 + }, + { + "epoch": 0.41575492341356673, + "grad_norm": 0.203125, + "learning_rate": 0.00019478639789603872, + "loss": 1.6847, + "step": 1330 + }, + { + "epoch": 0.41606752110034384, + "grad_norm": 0.21484375, + "learning_rate": 0.00019477856500490405, + "loss": 1.6309, + "step": 1331 + }, + { + "epoch": 0.416380118787121, + "grad_norm": 0.21484375, + "learning_rate": 0.00019477072639186664, + "loss": 1.9451, + "step": 1332 + }, + { + "epoch": 0.4166927164738981, + "grad_norm": 0.220703125, + 
"learning_rate": 0.0001947628820573997, + "loss": 1.8675, + "step": 1333 + }, + { + "epoch": 0.4170053141606752, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019475503200197685, + "loss": 1.5601, + "step": 1334 + }, + { + "epoch": 0.4173179118474523, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019474717622607195, + "loss": 1.5294, + "step": 1335 + }, + { + "epoch": 0.4176305095342294, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019473931473015926, + "loss": 1.7433, + "step": 1336 + }, + { + "epoch": 0.4179431072210066, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019473144751471345, + "loss": 1.6771, + "step": 1337 + }, + { + "epoch": 0.4182557049077837, + "grad_norm": 0.212890625, + "learning_rate": 0.0001947235745802094, + "loss": 1.9994, + "step": 1338 + }, + { + "epoch": 0.4185683025945608, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001947156959271225, + "loss": 1.726, + "step": 1339 + }, + { + "epoch": 0.4188809002813379, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019470781155592827, + "loss": 1.8079, + "step": 1340 + }, + { + "epoch": 0.41919349796811506, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019469992146710282, + "loss": 1.8046, + "step": 1341 + }, + { + "epoch": 0.41950609565489216, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001946920256611224, + "loss": 1.619, + "step": 1342 + }, + { + "epoch": 0.41981869334166927, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019468412413846373, + "loss": 1.6015, + "step": 1343 + }, + { + "epoch": 0.4201312910284464, + "grad_norm": 0.212890625, + "learning_rate": 0.00019467621689960385, + "loss": 1.7538, + "step": 1344 + }, + { + "epoch": 0.42044388871522353, + "grad_norm": 0.20703125, + "learning_rate": 0.00019466830394502009, + "loss": 1.8732, + "step": 1345 + }, + { + "epoch": 0.42075648640200064, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001946603852751902, + "loss": 1.7492, + "step": 1346 + }, + { + "epoch": 0.42106908408877775, + "grad_norm": 0.224609375, + "learning_rate": 0.0001946524608905922, + "loss": 1.6893, + "step": 1347 + }, + { + "epoch": 0.42138168177555485, + "grad_norm": 0.220703125, + "learning_rate": 0.00019464453079170454, + "loss": 1.5848, + "step": 1348 + }, + { + "epoch": 0.42169427946233196, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019463659497900593, + "loss": 1.5974, + "step": 1349 + }, + { + "epoch": 0.4220068771491091, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001946286534529755, + "loss": 1.9757, + "step": 1350 + }, + { + "epoch": 0.4223194748358862, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001946207062140927, + "loss": 1.9514, + "step": 1351 + }, + { + "epoch": 0.4226320725226633, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019461275326283724, + "loss": 1.894, + "step": 1352 + }, + { + "epoch": 0.42294467020944043, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019460479459968932, + "loss": 1.5872, + "step": 1353 + }, + { + "epoch": 0.4232572678962176, + "grad_norm": 0.21484375, + "learning_rate": 0.0001945968302251294, + "loss": 1.5275, + "step": 1354 + }, + { + "epoch": 0.4235698655829947, + "grad_norm": 0.220703125, + "learning_rate": 0.0001945888601396383, + "loss": 1.6427, + "step": 1355 + }, + { + "epoch": 0.4238824632697718, + "grad_norm": 0.21875, + "learning_rate": 0.00019458088434369715, + "loss": 1.6407, + "step": 1356 + }, + { + "epoch": 0.4241950609565489, + "grad_norm": 0.224609375, + "learning_rate": 0.00019457290283778747, + "loss": 1.9373, + "step": 1357 + }, + { + "epoch": 
0.424507658643326, + "grad_norm": 0.21484375, + "learning_rate": 0.0001945649156223912, + "loss": 1.7385, + "step": 1358 + }, + { + "epoch": 0.4248202563301032, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001945569226979904, + "loss": 1.8262, + "step": 1359 + }, + { + "epoch": 0.4251328540168803, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019454892406506775, + "loss": 1.6286, + "step": 1360 + }, + { + "epoch": 0.4254454517036574, + "grad_norm": 0.22265625, + "learning_rate": 0.00019454091972410603, + "loss": 1.7992, + "step": 1361 + }, + { + "epoch": 0.4257580493904345, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001945329096755885, + "loss": 1.9609, + "step": 1362 + }, + { + "epoch": 0.42607064707721165, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019452489391999874, + "loss": 1.9051, + "step": 1363 + }, + { + "epoch": 0.42638324476398876, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019451687245782072, + "loss": 1.7331, + "step": 1364 + }, + { + "epoch": 0.42669584245076586, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019450884528953864, + "loss": 2.1455, + "step": 1365 + }, + { + "epoch": 0.42700844013754297, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019450081241563716, + "loss": 1.8298, + "step": 1366 + }, + { + "epoch": 0.4273210378243201, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019449277383660118, + "loss": 1.8084, + "step": 1367 + }, + { + "epoch": 0.42763363551109723, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019448472955291605, + "loss": 1.6876, + "step": 1368 + }, + { + "epoch": 0.42794623319787434, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001944766795650674, + "loss": 1.7431, + "step": 1369 + }, + { + "epoch": 0.42825883088465144, + "grad_norm": 0.228515625, + "learning_rate": 0.0001944686238735412, + "loss": 1.7904, + "step": 1370 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019446056247882378, + "loss": 1.8465, + "step": 1371 + }, + { + "epoch": 0.4288840262582057, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019445249538140185, + "loss": 1.6672, + "step": 1372 + }, + { + "epoch": 0.4291966239449828, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001944444225817624, + "loss": 1.9209, + "step": 1373 + }, + { + "epoch": 0.4295092216317599, + "grad_norm": 0.220703125, + "learning_rate": 0.00019443634408039282, + "loss": 1.8336, + "step": 1374 + }, + { + "epoch": 0.429821819318537, + "grad_norm": 0.22265625, + "learning_rate": 0.0001944282598777808, + "loss": 1.9261, + "step": 1375 + }, + { + "epoch": 0.4301344170053142, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001944201699744144, + "loss": 1.6371, + "step": 1376 + }, + { + "epoch": 0.4304470146920913, + "grad_norm": 0.220703125, + "learning_rate": 0.00019441207437078203, + "loss": 1.4774, + "step": 1377 + }, + { + "epoch": 0.4307596123788684, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001944039730673724, + "loss": 1.5849, + "step": 1378 + }, + { + "epoch": 0.4310722100656455, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001943958660646746, + "loss": 1.8103, + "step": 1379 + }, + { + "epoch": 0.4313848077524226, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019438775336317812, + "loss": 1.8946, + "step": 1380 + }, + { + "epoch": 0.43169740543919977, + "grad_norm": 0.20703125, + "learning_rate": 0.00019437963496337266, + "loss": 1.6056, + "step": 1381 + }, + { + "epoch": 0.43201000312597687, + "grad_norm": 0.220703125, + "learning_rate": 
0.00019437151086574837, + "loss": 1.6991, + "step": 1382 + }, + { + "epoch": 0.432322600812754, + "grad_norm": 0.2265625, + "learning_rate": 0.00019436338107079574, + "loss": 1.6126, + "step": 1383 + }, + { + "epoch": 0.4326351984995311, + "grad_norm": 0.216796875, + "learning_rate": 0.00019435524557900551, + "loss": 1.4967, + "step": 1384 + }, + { + "epoch": 0.43294779618630824, + "grad_norm": 0.212890625, + "learning_rate": 0.00019434710439086888, + "loss": 1.5868, + "step": 1385 + }, + { + "epoch": 0.43326039387308535, + "grad_norm": 0.2265625, + "learning_rate": 0.00019433895750687734, + "loss": 1.7528, + "step": 1386 + }, + { + "epoch": 0.43357299155986245, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019433080492752268, + "loss": 1.899, + "step": 1387 + }, + { + "epoch": 0.43388558924663956, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019432264665329715, + "loss": 2.0873, + "step": 1388 + }, + { + "epoch": 0.4341981869334167, + "grad_norm": 0.216796875, + "learning_rate": 0.00019431448268469325, + "loss": 1.4453, + "step": 1389 + }, + { + "epoch": 0.4345107846201938, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019430631302220385, + "loss": 1.9314, + "step": 1390 + }, + { + "epoch": 0.43482338230697093, + "grad_norm": 0.21875, + "learning_rate": 0.0001942981376663221, + "loss": 1.5989, + "step": 1391 + }, + { + "epoch": 0.43513597999374803, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019428995661754171, + "loss": 1.8037, + "step": 1392 + }, + { + "epoch": 0.43544857768052514, + "grad_norm": 0.20703125, + "learning_rate": 0.0001942817698763564, + "loss": 1.7903, + "step": 1393 + }, + { + "epoch": 0.4357611753673023, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019427357744326057, + "loss": 1.7809, + "step": 1394 + }, + { + "epoch": 0.4360737730540794, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001942653793187487, + "loss": 1.552, + "step": 1395 + }, + { + "epoch": 0.4363863707408565, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019425717550331572, + "loss": 1.7079, + "step": 1396 + }, + { + "epoch": 0.4366989684276336, + "grad_norm": 0.2119140625, + "learning_rate": 0.000194248965997457, + "loss": 1.8321, + "step": 1397 + }, + { + "epoch": 0.4370115661144108, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019424075080166805, + "loss": 1.6185, + "step": 1398 + }, + { + "epoch": 0.4373241638011879, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019423252991644492, + "loss": 1.7149, + "step": 1399 + }, + { + "epoch": 0.437636761487965, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019422430334228386, + "loss": 1.7048, + "step": 1400 + }, + { + "epoch": 0.4379493591747421, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019421607107968154, + "loss": 1.8062, + "step": 1401 + }, + { + "epoch": 0.4382619568615192, + "grad_norm": 0.2265625, + "learning_rate": 0.00019420783312913494, + "loss": 1.8332, + "step": 1402 + }, + { + "epoch": 0.43857455454829636, + "grad_norm": 0.306640625, + "learning_rate": 0.0001941995894911414, + "loss": 2.397, + "step": 1403 + }, + { + "epoch": 0.43888715223507346, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019419134016619865, + "loss": 1.6672, + "step": 1404 + }, + { + "epoch": 0.43919974992185057, + "grad_norm": 0.2265625, + "learning_rate": 0.0001941830851548046, + "loss": 1.6112, + "step": 1405 + }, + { + "epoch": 0.4395123476086277, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001941748244574577, + "loss": 1.7182, + "step": 1406 + }, + { + "epoch": 0.43982494529540483, + 
"grad_norm": 0.2236328125, + "learning_rate": 0.00019416655807465667, + "loss": 1.7438, + "step": 1407 + }, + { + "epoch": 0.44013754298218194, + "grad_norm": 0.216796875, + "learning_rate": 0.0001941582860069005, + "loss": 1.8327, + "step": 1408 + }, + { + "epoch": 0.44045014066895904, + "grad_norm": 0.224609375, + "learning_rate": 0.00019415000825468863, + "loss": 2.0563, + "step": 1409 + }, + { + "epoch": 0.44076273835573615, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001941417248185208, + "loss": 1.9451, + "step": 1410 + }, + { + "epoch": 0.4410753360425133, + "grad_norm": 0.224609375, + "learning_rate": 0.00019413343569889702, + "loss": 1.8786, + "step": 1411 + }, + { + "epoch": 0.4413879337292904, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019412514089631785, + "loss": 1.7905, + "step": 1412 + }, + { + "epoch": 0.4417005314160675, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019411684041128392, + "loss": 1.7573, + "step": 1413 + }, + { + "epoch": 0.4420131291028446, + "grad_norm": 0.220703125, + "learning_rate": 0.00019410853424429642, + "loss": 1.6898, + "step": 1414 + }, + { + "epoch": 0.44232572678962173, + "grad_norm": 0.220703125, + "learning_rate": 0.00019410022239585678, + "loss": 1.7676, + "step": 1415 + }, + { + "epoch": 0.4426383244763989, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001940919048664668, + "loss": 1.7774, + "step": 1416 + }, + { + "epoch": 0.442950922163176, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019408358165662866, + "loss": 1.6328, + "step": 1417 + }, + { + "epoch": 0.4432635198499531, + "grad_norm": 0.2265625, + "learning_rate": 0.00019407525276684474, + "loss": 1.7037, + "step": 1418 + }, + { + "epoch": 0.4435761175367302, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019406691819761796, + "loss": 1.81, + "step": 1419 + }, + { + "epoch": 0.44388871522350737, + "grad_norm": 0.2421875, + "learning_rate": 0.00019405857794945147, + "loss": 1.8474, + "step": 1420 + }, + { + "epoch": 0.4442013129102845, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019405023202284874, + "loss": 1.6398, + "step": 1421 + }, + { + "epoch": 0.4445139105970616, + "grad_norm": 0.22265625, + "learning_rate": 0.0001940418804183137, + "loss": 1.5592, + "step": 1422 + }, + { + "epoch": 0.4448265082838387, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019403352313635046, + "loss": 1.6566, + "step": 1423 + }, + { + "epoch": 0.44513910597061584, + "grad_norm": 0.21484375, + "learning_rate": 0.0001940251601774636, + "loss": 1.6928, + "step": 1424 + }, + { + "epoch": 0.44545170365739295, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019401679154215802, + "loss": 2.029, + "step": 1425 + }, + { + "epoch": 0.44576430134417006, + "grad_norm": 0.220703125, + "learning_rate": 0.0001940084172309389, + "loss": 1.9225, + "step": 1426 + }, + { + "epoch": 0.44607689903094716, + "grad_norm": 0.2431640625, + "learning_rate": 0.00019400003724431185, + "loss": 1.9033, + "step": 1427 + }, + { + "epoch": 0.44638949671772427, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019399165158278279, + "loss": 1.9373, + "step": 1428 + }, + { + "epoch": 0.4467020944045014, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019398326024685792, + "loss": 1.8287, + "step": 1429 + }, + { + "epoch": 0.44701469209127853, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019397486323704388, + "loss": 1.4876, + "step": 1430 + }, + { + "epoch": 0.44732728977805564, + "grad_norm": 0.2109375, + "learning_rate": 0.0001939664605538476, + "loss": 1.7532, + 
"step": 1431 + }, + { + "epoch": 0.44763988746483274, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001939580521977763, + "loss": 1.8811, + "step": 1432 + }, + { + "epoch": 0.4479524851516099, + "grad_norm": 0.22265625, + "learning_rate": 0.00019394963816933772, + "loss": 1.8956, + "step": 1433 + }, + { + "epoch": 0.448265082838387, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019394121846903975, + "loss": 1.7634, + "step": 1434 + }, + { + "epoch": 0.4485776805251641, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001939327930973907, + "loss": 1.5284, + "step": 1435 + }, + { + "epoch": 0.4488902782119412, + "grad_norm": 0.234375, + "learning_rate": 0.00019392436205489924, + "loss": 1.8581, + "step": 1436 + }, + { + "epoch": 0.4492028758987183, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019391592534207436, + "loss": 1.4981, + "step": 1437 + }, + { + "epoch": 0.4495154735854955, + "grad_norm": 0.220703125, + "learning_rate": 0.00019390748295942535, + "loss": 1.6315, + "step": 1438 + }, + { + "epoch": 0.4498280712722726, + "grad_norm": 0.216796875, + "learning_rate": 0.00019389903490746194, + "loss": 1.755, + "step": 1439 + }, + { + "epoch": 0.4501406689590497, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019389058118669418, + "loss": 1.6564, + "step": 1440 + }, + { + "epoch": 0.4504532666458268, + "grad_norm": 0.232421875, + "learning_rate": 0.00019388212179763235, + "loss": 1.8079, + "step": 1441 + }, + { + "epoch": 0.45076586433260396, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001938736567407872, + "loss": 1.7621, + "step": 1442 + }, + { + "epoch": 0.45107846201938107, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019386518601666977, + "loss": 2.0246, + "step": 1443 + }, + { + "epoch": 0.45139105970615817, + "grad_norm": 0.228515625, + "learning_rate": 0.0001938567096257914, + "loss": 1.7006, + "step": 1444 + }, + { + "epoch": 0.4517036573929353, + "grad_norm": 0.23046875, + "learning_rate": 0.00019384822756866394, + "loss": 1.7433, + "step": 1445 + }, + { + "epoch": 0.45201625507971244, + "grad_norm": 0.220703125, + "learning_rate": 0.00019383973984579936, + "loss": 1.6673, + "step": 1446 + }, + { + "epoch": 0.45232885276648954, + "grad_norm": 0.20703125, + "learning_rate": 0.00019383124645771008, + "loss": 1.7402, + "step": 1447 + }, + { + "epoch": 0.45264145045326665, + "grad_norm": 0.220703125, + "learning_rate": 0.00019382274740490892, + "loss": 1.7445, + "step": 1448 + }, + { + "epoch": 0.45295404814004375, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001938142426879089, + "loss": 1.752, + "step": 1449 + }, + { + "epoch": 0.45326664582682086, + "grad_norm": 0.224609375, + "learning_rate": 0.00019380573230722353, + "loss": 1.7653, + "step": 1450 + }, + { + "epoch": 0.453579243513598, + "grad_norm": 0.224609375, + "learning_rate": 0.00019379721626336656, + "loss": 1.4672, + "step": 1451 + }, + { + "epoch": 0.4538918412003751, + "grad_norm": 0.224609375, + "learning_rate": 0.0001937886945568521, + "loss": 1.6907, + "step": 1452 + }, + { + "epoch": 0.45420443888715223, + "grad_norm": 0.2265625, + "learning_rate": 0.00019378016718819466, + "loss": 1.7775, + "step": 1453 + }, + { + "epoch": 0.45451703657392933, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019377163415790902, + "loss": 1.913, + "step": 1454 + }, + { + "epoch": 0.4548296342607065, + "grad_norm": 0.216796875, + "learning_rate": 0.00019376309546651033, + "loss": 1.8471, + "step": 1455 + }, + { + "epoch": 0.4551422319474836, + "grad_norm": 0.228515625, + 
"learning_rate": 0.00019375455111451405, + "loss": 1.5682, + "step": 1456 + }, + { + "epoch": 0.4554548296342607, + "grad_norm": 0.220703125, + "learning_rate": 0.00019374600110243608, + "loss": 1.7008, + "step": 1457 + }, + { + "epoch": 0.4557674273210378, + "grad_norm": 0.21875, + "learning_rate": 0.00019373744543079257, + "loss": 1.7075, + "step": 1458 + }, + { + "epoch": 0.4560800250078149, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001937288841001, + "loss": 1.6143, + "step": 1459 + }, + { + "epoch": 0.4563926226945921, + "grad_norm": 0.21484375, + "learning_rate": 0.00019372031711087527, + "loss": 1.6665, + "step": 1460 + }, + { + "epoch": 0.4567052203813692, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019371174446363557, + "loss": 1.6533, + "step": 1461 + }, + { + "epoch": 0.4570178180681463, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019370316615889842, + "loss": 1.5501, + "step": 1462 + }, + { + "epoch": 0.4573304157549234, + "grad_norm": 0.232421875, + "learning_rate": 0.00019369458219718175, + "loss": 1.8101, + "step": 1463 + }, + { + "epoch": 0.45764301344170055, + "grad_norm": 0.216796875, + "learning_rate": 0.00019368599257900372, + "loss": 1.6708, + "step": 1464 + }, + { + "epoch": 0.45795561112847766, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019367739730488296, + "loss": 1.6922, + "step": 1465 + }, + { + "epoch": 0.45826820881525476, + "grad_norm": 0.220703125, + "learning_rate": 0.00019366879637533834, + "loss": 1.6808, + "step": 1466 + }, + { + "epoch": 0.45858080650203187, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019366018979088913, + "loss": 1.654, + "step": 1467 + }, + { + "epoch": 0.45889340418880903, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001936515775520549, + "loss": 1.7892, + "step": 1468 + }, + { + "epoch": 0.45920600187558613, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019364295965935562, + "loss": 1.6039, + "step": 1469 + }, + { + "epoch": 0.45951859956236324, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001936343361133115, + "loss": 1.6348, + "step": 1470 + }, + { + "epoch": 0.45983119724914034, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001936257069144432, + "loss": 2.0579, + "step": 1471 + }, + { + "epoch": 0.46014379493591745, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019361707206327168, + "loss": 1.5824, + "step": 1472 + }, + { + "epoch": 0.4604563926226946, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001936084315603182, + "loss": 1.6563, + "step": 1473 + }, + { + "epoch": 0.4607689903094717, + "grad_norm": 0.21484375, + "learning_rate": 0.0001935997854061044, + "loss": 1.7782, + "step": 1474 + }, + { + "epoch": 0.4610815879962488, + "grad_norm": 0.22265625, + "learning_rate": 0.00019359113360115234, + "loss": 1.7625, + "step": 1475 + }, + { + "epoch": 0.4613941856830259, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019358247614598427, + "loss": 1.5607, + "step": 1476 + }, + { + "epoch": 0.4617067833698031, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019357381304112281, + "loss": 1.6091, + "step": 1477 + }, + { + "epoch": 0.4620193810565802, + "grad_norm": 0.21875, + "learning_rate": 0.00019356514428709104, + "loss": 1.5822, + "step": 1478 + }, + { + "epoch": 0.4623319787433573, + "grad_norm": 0.21875, + "learning_rate": 0.0001935564698844123, + "loss": 1.8785, + "step": 1479 + }, + { + "epoch": 0.4626445764301344, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001935477898336102, + "loss": 1.4933, + "step": 1480 + }, + { + "epoch": 
0.4629571741169115, + "grad_norm": 0.341796875, + "learning_rate": 0.00019353910413520887, + "loss": 2.2543, + "step": 1481 + }, + { + "epoch": 0.46326977180368867, + "grad_norm": 0.2060546875, + "learning_rate": 0.0001935304127897326, + "loss": 1.6022, + "step": 1482 + }, + { + "epoch": 0.4635823694904658, + "grad_norm": 0.224609375, + "learning_rate": 0.00019352171579770615, + "loss": 1.9542, + "step": 1483 + }, + { + "epoch": 0.4638949671772429, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019351301315965452, + "loss": 1.5863, + "step": 1484 + }, + { + "epoch": 0.46420756486402, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019350430487610312, + "loss": 1.9259, + "step": 1485 + }, + { + "epoch": 0.46452016255079714, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001934955909475777, + "loss": 1.9044, + "step": 1486 + }, + { + "epoch": 0.46483276023757425, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019348687137460432, + "loss": 1.829, + "step": 1487 + }, + { + "epoch": 0.46514535792435135, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019347814615770933, + "loss": 1.5524, + "step": 1488 + }, + { + "epoch": 0.46545795561112846, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019346941529741954, + "loss": 1.683, + "step": 1489 + }, + { + "epoch": 0.4657705532979056, + "grad_norm": 0.220703125, + "learning_rate": 0.0001934606787942621, + "loss": 1.8919, + "step": 1490 + }, + { + "epoch": 0.4660831509846827, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019345193664876433, + "loss": 1.7553, + "step": 1491 + }, + { + "epoch": 0.46639574867145983, + "grad_norm": 0.21875, + "learning_rate": 0.0001934431888614541, + "loss": 1.9543, + "step": 1492 + }, + { + "epoch": 0.46670834635823694, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019343443543285945, + "loss": 1.6919, + "step": 1493 + }, + { + "epoch": 0.46702094404501404, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019342567636350887, + "loss": 1.6121, + "step": 1494 + }, + { + "epoch": 0.4673335417317912, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019341691165393116, + "loss": 1.5772, + "step": 1495 + }, + { + "epoch": 0.4676461394185683, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019340814130465548, + "loss": 1.9449, + "step": 1496 + }, + { + "epoch": 0.4679587371053454, + "grad_norm": 0.224609375, + "learning_rate": 0.00019339936531621122, + "loss": 1.7063, + "step": 1497 + }, + { + "epoch": 0.4682713347921225, + "grad_norm": 0.212890625, + "learning_rate": 0.0001933905836891283, + "loss": 1.7768, + "step": 1498 + }, + { + "epoch": 0.4685839324788997, + "grad_norm": 0.21875, + "learning_rate": 0.00019338179642393685, + "loss": 1.7279, + "step": 1499 + }, + { + "epoch": 0.4688965301656768, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001933730035211673, + "loss": 1.7344, + "step": 1500 + }, + { + "epoch": 0.4692091278524539, + "grad_norm": 0.21484375, + "learning_rate": 0.00019336420498135057, + "loss": 1.6349, + "step": 1501 + }, + { + "epoch": 0.469521725539231, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001933554008050178, + "loss": 1.703, + "step": 1502 + }, + { + "epoch": 0.46983432322600815, + "grad_norm": 0.21875, + "learning_rate": 0.00019334659099270053, + "loss": 1.6039, + "step": 1503 + }, + { + "epoch": 0.47014692091278526, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001933377755449306, + "loss": 1.7018, + "step": 1504 + }, + { + "epoch": 0.47045951859956237, + "grad_norm": 0.22265625, + "learning_rate": 0.00019332895446224022, + "loss": 
1.5957, + "step": 1505 + }, + { + "epoch": 0.47077211628633947, + "grad_norm": 0.2265625, + "learning_rate": 0.00019332012774516191, + "loss": 1.6054, + "step": 1506 + }, + { + "epoch": 0.4710847139731166, + "grad_norm": 0.216796875, + "learning_rate": 0.0001933112953942286, + "loss": 1.6822, + "step": 1507 + }, + { + "epoch": 0.47139731165989374, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019330245740997346, + "loss": 1.6045, + "step": 1508 + }, + { + "epoch": 0.47170990934667084, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019329361379293006, + "loss": 1.6817, + "step": 1509 + }, + { + "epoch": 0.47202250703344795, + "grad_norm": 0.26953125, + "learning_rate": 0.00019328476454363237, + "loss": 1.6334, + "step": 1510 + }, + { + "epoch": 0.47233510472022505, + "grad_norm": 0.236328125, + "learning_rate": 0.00019327590966261452, + "loss": 1.9416, + "step": 1511 + }, + { + "epoch": 0.4726477024070022, + "grad_norm": 0.21484375, + "learning_rate": 0.00019326704915041115, + "loss": 1.8148, + "step": 1512 + }, + { + "epoch": 0.4729603000937793, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001932581830075572, + "loss": 1.6804, + "step": 1513 + }, + { + "epoch": 0.4732728977805564, + "grad_norm": 0.224609375, + "learning_rate": 0.00019324931123458784, + "loss": 1.6578, + "step": 1514 + }, + { + "epoch": 0.47358549546733353, + "grad_norm": 0.232421875, + "learning_rate": 0.00019324043383203875, + "loss": 1.7513, + "step": 1515 + }, + { + "epoch": 0.47389809315411063, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019323155080044587, + "loss": 1.8009, + "step": 1516 + }, + { + "epoch": 0.4742106908408878, + "grad_norm": 0.23828125, + "learning_rate": 0.00019322266214034546, + "loss": 1.5399, + "step": 1517 + }, + { + "epoch": 0.4745232885276649, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019321376785227416, + "loss": 1.6751, + "step": 1518 + }, + { + "epoch": 0.474835886214442, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019320486793676889, + "loss": 1.5572, + "step": 1519 + }, + { + "epoch": 0.4751484839012191, + "grad_norm": 0.228515625, + "learning_rate": 0.00019319596239436698, + "loss": 1.6178, + "step": 1520 + }, + { + "epoch": 0.47546108158799627, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019318705122560602, + "loss": 1.5581, + "step": 1521 + }, + { + "epoch": 0.4757736792747734, + "grad_norm": 0.2265625, + "learning_rate": 0.00019317813443102408, + "loss": 1.6904, + "step": 1522 + }, + { + "epoch": 0.4760862769615505, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001931692120111594, + "loss": 1.9162, + "step": 1523 + }, + { + "epoch": 0.4763988746483276, + "grad_norm": 0.2265625, + "learning_rate": 0.0001931602839665507, + "loss": 1.6703, + "step": 1524 + }, + { + "epoch": 0.47671147233510475, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001931513502977369, + "loss": 1.6865, + "step": 1525 + }, + { + "epoch": 0.47702407002188185, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019314241100525738, + "loss": 1.7221, + "step": 1526 + }, + { + "epoch": 0.47733666770865896, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019313346608965183, + "loss": 1.6306, + "step": 1527 + }, + { + "epoch": 0.47764926539543606, + "grad_norm": 0.224609375, + "learning_rate": 0.00019312451555146022, + "loss": 2.0435, + "step": 1528 + }, + { + "epoch": 0.47796186308221317, + "grad_norm": 0.2265625, + "learning_rate": 0.00019311555939122298, + "loss": 1.4892, + "step": 1529 + }, + { + "epoch": 0.47827446076899033, + "grad_norm": 
0.2236328125, + "learning_rate": 0.00019310659760948075, + "loss": 1.7291, + "step": 1530 + }, + { + "epoch": 0.47858705845576743, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019309763020677458, + "loss": 1.7014, + "step": 1531 + }, + { + "epoch": 0.47889965614254454, + "grad_norm": 0.23828125, + "learning_rate": 0.00019308865718364583, + "loss": 2.0065, + "step": 1532 + }, + { + "epoch": 0.47921225382932164, + "grad_norm": 0.23046875, + "learning_rate": 0.00019307967854063622, + "loss": 1.5883, + "step": 1533 + }, + { + "epoch": 0.4795248515160988, + "grad_norm": 0.236328125, + "learning_rate": 0.0001930706942782878, + "loss": 1.7971, + "step": 1534 + }, + { + "epoch": 0.4798374492028759, + "grad_norm": 0.224609375, + "learning_rate": 0.00019306170439714298, + "loss": 1.6701, + "step": 1535 + }, + { + "epoch": 0.480150046889653, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019305270889774444, + "loss": 1.611, + "step": 1536 + }, + { + "epoch": 0.4804626445764301, + "grad_norm": 0.240234375, + "learning_rate": 0.00019304370778063534, + "loss": 1.8515, + "step": 1537 + }, + { + "epoch": 0.4807752422632072, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019303470104635898, + "loss": 1.64, + "step": 1538 + }, + { + "epoch": 0.4810878399499844, + "grad_norm": 0.228515625, + "learning_rate": 0.0001930256886954592, + "loss": 1.7283, + "step": 1539 + }, + { + "epoch": 0.4814004376367615, + "grad_norm": 0.244140625, + "learning_rate": 0.00019301667072848004, + "loss": 1.8076, + "step": 1540 + }, + { + "epoch": 0.4817130353235386, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019300764714596594, + "loss": 1.9384, + "step": 1541 + }, + { + "epoch": 0.4820256330103157, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019299861794846166, + "loss": 1.8492, + "step": 1542 + }, + { + "epoch": 0.48233823069709286, + "grad_norm": 0.21875, + "learning_rate": 0.00019298958313651227, + "loss": 1.744, + "step": 1543 + }, + { + "epoch": 0.48265082838386997, + "grad_norm": 0.220703125, + "learning_rate": 0.0001929805427106633, + "loss": 1.7691, + "step": 1544 + }, + { + "epoch": 0.4829634260706471, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019297149667146045, + "loss": 1.6095, + "step": 1545 + }, + { + "epoch": 0.4832760237574242, + "grad_norm": 0.2099609375, + "learning_rate": 0.0001929624450194499, + "loss": 1.8153, + "step": 1546 + }, + { + "epoch": 0.48358862144420134, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019295338775517803, + "loss": 1.8315, + "step": 1547 + }, + { + "epoch": 0.48390121913097844, + "grad_norm": 0.208984375, + "learning_rate": 0.00019294432487919173, + "loss": 1.6651, + "step": 1548 + }, + { + "epoch": 0.48421381681775555, + "grad_norm": 0.20703125, + "learning_rate": 0.0001929352563920381, + "loss": 1.632, + "step": 1549 + }, + { + "epoch": 0.48452641450453265, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001929261822942646, + "loss": 1.5682, + "step": 1550 + }, + { + "epoch": 0.48483901219130976, + "grad_norm": 0.220703125, + "learning_rate": 0.00019291710258641907, + "loss": 1.7631, + "step": 1551 + }, + { + "epoch": 0.4851516098780869, + "grad_norm": 0.212890625, + "learning_rate": 0.00019290801726904962, + "loss": 1.6418, + "step": 1552 + }, + { + "epoch": 0.485464207564864, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001928989263427048, + "loss": 1.4744, + "step": 1553 + }, + { + "epoch": 0.48577680525164113, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001928898298079334, + "loss": 1.7507, + "step": 1554 + }, + { 
+ "epoch": 0.48608940293841824, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019288072766528462, + "loss": 1.5483, + "step": 1555 + }, + { + "epoch": 0.4864020006251954, + "grad_norm": 0.228515625, + "learning_rate": 0.00019287161991530792, + "loss": 1.7318, + "step": 1556 + }, + { + "epoch": 0.4867145983119725, + "grad_norm": 0.228515625, + "learning_rate": 0.0001928625065585532, + "loss": 1.8483, + "step": 1557 + }, + { + "epoch": 0.4870271959987496, + "grad_norm": 0.21875, + "learning_rate": 0.00019285338759557065, + "loss": 1.6431, + "step": 1558 + }, + { + "epoch": 0.4873397936855267, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019284426302691073, + "loss": 1.6648, + "step": 1559 + }, + { + "epoch": 0.4876523913723038, + "grad_norm": 0.21484375, + "learning_rate": 0.00019283513285312437, + "loss": 1.5061, + "step": 1560 + }, + { + "epoch": 0.487964989059081, + "grad_norm": 0.310546875, + "learning_rate": 0.0001928259970747627, + "loss": 2.72, + "step": 1561 + }, + { + "epoch": 0.4882775867458581, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019281685569237734, + "loss": 1.6893, + "step": 1562 + }, + { + "epoch": 0.4885901844326352, + "grad_norm": 0.216796875, + "learning_rate": 0.0001928077087065201, + "loss": 1.6951, + "step": 1563 + }, + { + "epoch": 0.4889027821194123, + "grad_norm": 0.220703125, + "learning_rate": 0.0001927985561177432, + "loss": 1.7366, + "step": 1564 + }, + { + "epoch": 0.48921537980618945, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019278939792659924, + "loss": 1.7637, + "step": 1565 + }, + { + "epoch": 0.48952797749296656, + "grad_norm": 0.212890625, + "learning_rate": 0.00019278023413364106, + "loss": 1.5522, + "step": 1566 + }, + { + "epoch": 0.48984057517974366, + "grad_norm": 0.2373046875, + "learning_rate": 0.00019277106473942194, + "loss": 1.8184, + "step": 1567 + }, + { + "epoch": 0.49015317286652077, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019276188974449543, + "loss": 1.5573, + "step": 1568 + }, + { + "epoch": 0.49046577055329793, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019275270914941538, + "loss": 1.5074, + "step": 1569 + }, + { + "epoch": 0.49077836824007504, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019274352295473612, + "loss": 1.9685, + "step": 1570 + }, + { + "epoch": 0.49109096592685214, + "grad_norm": 0.23046875, + "learning_rate": 0.00019273433116101217, + "loss": 1.8918, + "step": 1571 + }, + { + "epoch": 0.49140356361362925, + "grad_norm": 0.240234375, + "learning_rate": 0.00019272513376879854, + "loss": 1.8173, + "step": 1572 + }, + { + "epoch": 0.49171616130040635, + "grad_norm": 0.220703125, + "learning_rate": 0.00019271593077865035, + "loss": 1.7093, + "step": 1573 + }, + { + "epoch": 0.4920287589871835, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019270672219112332, + "loss": 1.7993, + "step": 1574 + }, + { + "epoch": 0.4923413566739606, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019269750800677331, + "loss": 1.7468, + "step": 1575 + }, + { + "epoch": 0.4926539543607377, + "grad_norm": 0.208984375, + "learning_rate": 0.00019268828822615661, + "loss": 1.4455, + "step": 1576 + }, + { + "epoch": 0.4929665520475148, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019267906284982985, + "loss": 1.9409, + "step": 1577 + }, + { + "epoch": 0.493279149734292, + "grad_norm": 0.220703125, + "learning_rate": 0.00019266983187834995, + "loss": 1.8848, + "step": 1578 + }, + { + "epoch": 0.4935917474210691, + "grad_norm": 0.220703125, + "learning_rate": 
0.0001926605953122742, + "loss": 1.5927, + "step": 1579 + }, + { + "epoch": 0.4939043451078462, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019265135315216028, + "loss": 1.7506, + "step": 1580 + }, + { + "epoch": 0.4942169427946233, + "grad_norm": 0.22265625, + "learning_rate": 0.00019264210539856607, + "loss": 1.7024, + "step": 1581 + }, + { + "epoch": 0.49452954048140046, + "grad_norm": 0.228515625, + "learning_rate": 0.0001926328520520499, + "loss": 1.8899, + "step": 1582 + }, + { + "epoch": 0.49484213816817757, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001926235931131704, + "loss": 1.7209, + "step": 1583 + }, + { + "epoch": 0.4951547358549547, + "grad_norm": 0.22265625, + "learning_rate": 0.00019261432858248657, + "loss": 1.582, + "step": 1584 + }, + { + "epoch": 0.4954673335417318, + "grad_norm": 0.216796875, + "learning_rate": 0.0001926050584605577, + "loss": 1.7583, + "step": 1585 + }, + { + "epoch": 0.4957799312285089, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019259578274794344, + "loss": 1.7366, + "step": 1586 + }, + { + "epoch": 0.49609252891528605, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001925865014452038, + "loss": 1.7721, + "step": 1587 + }, + { + "epoch": 0.49640512660206315, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019257721455289906, + "loss": 1.9818, + "step": 1588 + }, + { + "epoch": 0.49671772428884026, + "grad_norm": 0.220703125, + "learning_rate": 0.00019256792207158991, + "loss": 1.719, + "step": 1589 + }, + { + "epoch": 0.49703032197561736, + "grad_norm": 0.220703125, + "learning_rate": 0.00019255862400183733, + "loss": 1.7085, + "step": 1590 + }, + { + "epoch": 0.4973429196623945, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019254932034420266, + "loss": 1.5593, + "step": 1591 + }, + { + "epoch": 0.4976555173491716, + "grad_norm": 0.220703125, + "learning_rate": 0.00019254001109924763, + "loss": 1.6743, + "step": 1592 + }, + { + "epoch": 0.49796811503594873, + "grad_norm": 0.220703125, + "learning_rate": 0.0001925306962675342, + "loss": 1.5977, + "step": 1593 + }, + { + "epoch": 0.49828071272272584, + "grad_norm": 0.216796875, + "learning_rate": 0.00019252137584962472, + "loss": 1.6007, + "step": 1594 + }, + { + "epoch": 0.49859331040950294, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019251204984608184, + "loss": 1.5078, + "step": 1595 + }, + { + "epoch": 0.4989059080962801, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019250271825746866, + "loss": 1.9624, + "step": 1596 + }, + { + "epoch": 0.4992185057830572, + "grad_norm": 0.21484375, + "learning_rate": 0.0001924933810843485, + "loss": 1.6749, + "step": 1597 + }, + { + "epoch": 0.4995311034698343, + "grad_norm": 0.23046875, + "learning_rate": 0.00019248403832728504, + "loss": 1.7965, + "step": 1598 + }, + { + "epoch": 0.4998437011566114, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019247468998684233, + "loss": 1.7333, + "step": 1599 + }, + { + "epoch": 0.5001562988433885, + "grad_norm": 0.21484375, + "learning_rate": 0.00019246533606358476, + "loss": 1.9014, + "step": 1600 + }, + { + "epoch": 0.5001562988433885, + "eval_loss": 1.6468836069107056, + "eval_runtime": 1904.4552, + "eval_samples_per_second": 4.798, + "eval_steps_per_second": 2.399, + "step": 1600 + }, + { + "epoch": 0.5004688965301657, + "grad_norm": 0.2060546875, + "learning_rate": 0.000192455976558077, + "loss": 1.8399, + "step": 1601 + }, + { + "epoch": 0.5007814942169428, + "grad_norm": 0.22265625, + "learning_rate": 0.00019244661147088413, + "loss": 1.7516, + "step": 
1602 + }, + { + "epoch": 0.5010940919037199, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019243724080257154, + "loss": 1.6023, + "step": 1603 + }, + { + "epoch": 0.5014066895904971, + "grad_norm": 0.240234375, + "learning_rate": 0.0001924278645537049, + "loss": 1.8678, + "step": 1604 + }, + { + "epoch": 0.5017192872772741, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001924184827248503, + "loss": 1.8877, + "step": 1605 + }, + { + "epoch": 0.5020318849640513, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019240909531657415, + "loss": 1.7109, + "step": 1606 + }, + { + "epoch": 0.5023444826508284, + "grad_norm": 0.21484375, + "learning_rate": 0.00019239970232944314, + "loss": 1.9394, + "step": 1607 + }, + { + "epoch": 0.5026570803376055, + "grad_norm": 0.2265625, + "learning_rate": 0.00019239030376402437, + "loss": 1.6907, + "step": 1608 + }, + { + "epoch": 0.5029696780243826, + "grad_norm": 0.21875, + "learning_rate": 0.00019238089962088522, + "loss": 1.3726, + "step": 1609 + }, + { + "epoch": 0.5032822757111597, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019237148990059342, + "loss": 1.4186, + "step": 1610 + }, + { + "epoch": 0.5035948733979368, + "grad_norm": 0.232421875, + "learning_rate": 0.00019236207460371707, + "loss": 1.8961, + "step": 1611 + }, + { + "epoch": 0.503907471084714, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001923526537308246, + "loss": 1.5122, + "step": 1612 + }, + { + "epoch": 0.5042200687714911, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019234322728248473, + "loss": 1.6718, + "step": 1613 + }, + { + "epoch": 0.5045326664582682, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019233379525926652, + "loss": 1.5157, + "step": 1614 + }, + { + "epoch": 0.5048452641450454, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019232435766173946, + "loss": 1.8013, + "step": 1615 + }, + { + "epoch": 0.5051578618318224, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019231491449047327, + "loss": 1.6126, + "step": 1616 + }, + { + "epoch": 0.5054704595185996, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019230546574603805, + "loss": 1.9199, + "step": 1617 + }, + { + "epoch": 0.5057830572053766, + "grad_norm": 0.216796875, + "learning_rate": 0.00019229601142900426, + "loss": 1.8629, + "step": 1618 + }, + { + "epoch": 0.5060956548921538, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001922865515399426, + "loss": 1.9572, + "step": 1619 + }, + { + "epoch": 0.506408252578931, + "grad_norm": 0.212890625, + "learning_rate": 0.0001922770860794243, + "loss": 1.8666, + "step": 1620 + }, + { + "epoch": 0.506720850265708, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019226761504802066, + "loss": 1.6269, + "step": 1621 + }, + { + "epoch": 0.5070334479524852, + "grad_norm": 0.212890625, + "learning_rate": 0.00019225813844630355, + "loss": 1.4542, + "step": 1622 + }, + { + "epoch": 0.5073460456392622, + "grad_norm": 0.232421875, + "learning_rate": 0.00019224865627484502, + "loss": 1.726, + "step": 1623 + }, + { + "epoch": 0.5076586433260394, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019223916853421756, + "loss": 1.9227, + "step": 1624 + }, + { + "epoch": 0.5079712410128165, + "grad_norm": 0.232421875, + "learning_rate": 0.000192229675224994, + "loss": 1.7876, + "step": 1625 + }, + { + "epoch": 0.5082838386995936, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001922201763477474, + "loss": 1.9213, + "step": 1626 + }, + { + "epoch": 0.5085964363863708, + "grad_norm": 0.2294921875, + "learning_rate": 
0.00019221067190305121, + "loss": 1.8536, + "step": 1627 + }, + { + "epoch": 0.5089090340731478, + "grad_norm": 0.236328125, + "learning_rate": 0.00019220116189147928, + "loss": 1.7391, + "step": 1628 + }, + { + "epoch": 0.509221631759925, + "grad_norm": 0.22265625, + "learning_rate": 0.00019219164631360572, + "loss": 1.5871, + "step": 1629 + }, + { + "epoch": 0.5095342294467021, + "grad_norm": 0.236328125, + "learning_rate": 0.00019218212517000497, + "loss": 1.7358, + "step": 1630 + }, + { + "epoch": 0.5098468271334792, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019217259846125186, + "loss": 1.7538, + "step": 1631 + }, + { + "epoch": 0.5101594248202563, + "grad_norm": 0.228515625, + "learning_rate": 0.00019216306618792151, + "loss": 2.0148, + "step": 1632 + }, + { + "epoch": 0.5104720225070335, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019215352835058944, + "loss": 1.655, + "step": 1633 + }, + { + "epoch": 0.5107846201938105, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001921439849498314, + "loss": 1.8552, + "step": 1634 + }, + { + "epoch": 0.5110972178805877, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001921344359862236, + "loss": 2.0283, + "step": 1635 + }, + { + "epoch": 0.5114098155673648, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019212488146034247, + "loss": 1.8859, + "step": 1636 + }, + { + "epoch": 0.5117224132541419, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019211532137276485, + "loss": 1.7173, + "step": 1637 + }, + { + "epoch": 0.5120350109409191, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001921057557240679, + "loss": 1.6262, + "step": 1638 + }, + { + "epoch": 0.5123476086276961, + "grad_norm": 0.234375, + "learning_rate": 0.00019209618451482911, + "loss": 1.6141, + "step": 1639 + }, + { + "epoch": 0.5126602063144733, + "grad_norm": 0.224609375, + "learning_rate": 0.0001920866077456263, + "loss": 1.7475, + "step": 1640 + }, + { + "epoch": 0.5129728040012503, + "grad_norm": 0.23828125, + "learning_rate": 0.0001920770254170376, + "loss": 1.7333, + "step": 1641 + }, + { + "epoch": 0.5132854016880275, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001920674375296416, + "loss": 1.8058, + "step": 1642 + }, + { + "epoch": 0.5135979993748047, + "grad_norm": 0.216796875, + "learning_rate": 0.00019205784408401705, + "loss": 1.5659, + "step": 1643 + }, + { + "epoch": 0.5139105970615817, + "grad_norm": 0.2421875, + "learning_rate": 0.00019204824508074314, + "loss": 1.6922, + "step": 1644 + }, + { + "epoch": 0.5142231947483589, + "grad_norm": 0.20703125, + "learning_rate": 0.00019203864052039937, + "loss": 1.5329, + "step": 1645 + }, + { + "epoch": 0.514535792435136, + "grad_norm": 0.2265625, + "learning_rate": 0.00019202903040356557, + "loss": 1.5799, + "step": 1646 + }, + { + "epoch": 0.5148483901219131, + "grad_norm": 0.216796875, + "learning_rate": 0.00019201941473082196, + "loss": 1.7131, + "step": 1647 + }, + { + "epoch": 0.5151609878086902, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019200979350274898, + "loss": 1.668, + "step": 1648 + }, + { + "epoch": 0.5154735854954673, + "grad_norm": 0.22265625, + "learning_rate": 0.00019200016671992755, + "loss": 1.8212, + "step": 1649 + }, + { + "epoch": 0.5157861831822445, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019199053438293884, + "loss": 1.745, + "step": 1650 + }, + { + "epoch": 0.5160987808690216, + "grad_norm": 0.224609375, + "learning_rate": 0.0001919808964923643, + "loss": 1.9392, + "step": 1651 + }, + { + "epoch": 0.5164113785557987, + "grad_norm": 
0.2255859375, + "learning_rate": 0.00019197125304878587, + "loss": 1.8001, + "step": 1652 + }, + { + "epoch": 0.5167239762425758, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019196160405278567, + "loss": 1.6449, + "step": 1653 + }, + { + "epoch": 0.5170365739293529, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019195194950494623, + "loss": 1.7974, + "step": 1654 + }, + { + "epoch": 0.51734917161613, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019194228940585043, + "loss": 1.6213, + "step": 1655 + }, + { + "epoch": 0.5176617693029072, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001919326237560815, + "loss": 1.7459, + "step": 1656 + }, + { + "epoch": 0.5179743669896842, + "grad_norm": 0.23046875, + "learning_rate": 0.00019192295255622286, + "loss": 2.0187, + "step": 1657 + }, + { + "epoch": 0.5182869646764614, + "grad_norm": 0.22265625, + "learning_rate": 0.00019191327580685846, + "loss": 1.465, + "step": 1658 + }, + { + "epoch": 0.5185995623632386, + "grad_norm": 0.21875, + "learning_rate": 0.0001919035935085725, + "loss": 1.7626, + "step": 1659 + }, + { + "epoch": 0.5189121600500156, + "grad_norm": 0.228515625, + "learning_rate": 0.00019189390566194943, + "loss": 1.6333, + "step": 1660 + }, + { + "epoch": 0.5192247577367928, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019188421226757423, + "loss": 1.6854, + "step": 1661 + }, + { + "epoch": 0.5195373554235698, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019187451332603202, + "loss": 1.5598, + "step": 1662 + }, + { + "epoch": 0.519849953110347, + "grad_norm": 0.224609375, + "learning_rate": 0.00019186480883790836, + "loss": 1.7953, + "step": 1663 + }, + { + "epoch": 0.5201625507971241, + "grad_norm": 0.22265625, + "learning_rate": 0.00019185509880378912, + "loss": 1.7901, + "step": 1664 + }, + { + "epoch": 0.5204751484839012, + "grad_norm": 0.23046875, + "learning_rate": 0.00019184538322426054, + "loss": 1.6819, + "step": 1665 + }, + { + "epoch": 0.5207877461706784, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019183566209990911, + "loss": 1.8034, + "step": 1666 + }, + { + "epoch": 0.5211003438574554, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019182593543132174, + "loss": 2.0384, + "step": 1667 + }, + { + "epoch": 0.5214129415442326, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019181620321908564, + "loss": 1.9369, + "step": 1668 + }, + { + "epoch": 0.5217255392310097, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019180646546378832, + "loss": 1.8764, + "step": 1669 + }, + { + "epoch": 0.5220381369177868, + "grad_norm": 0.220703125, + "learning_rate": 0.00019179672216601773, + "loss": 1.6419, + "step": 1670 + }, + { + "epoch": 0.5223507346045639, + "grad_norm": 0.408203125, + "learning_rate": 0.00019178697332636202, + "loss": 2.427, + "step": 1671 + }, + { + "epoch": 0.5226633322913411, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019177721894540975, + "loss": 1.81, + "step": 1672 + }, + { + "epoch": 0.5229759299781181, + "grad_norm": 0.216796875, + "learning_rate": 0.0001917674590237499, + "loss": 1.67, + "step": 1673 + }, + { + "epoch": 0.5232885276648953, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019175769356197153, + "loss": 1.6198, + "step": 1674 + }, + { + "epoch": 0.5236011253516724, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001917479225606643, + "loss": 1.8033, + "step": 1675 + }, + { + "epoch": 0.5239137230384495, + "grad_norm": 0.220703125, + "learning_rate": 0.00019173814602041803, + "loss": 1.6005, + "step": 1676 + }, + { + 
"epoch": 0.5242263207252267, + "grad_norm": 0.22265625, + "learning_rate": 0.00019172836394182303, + "loss": 1.6983, + "step": 1677 + }, + { + "epoch": 0.5245389184120037, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019171857632546978, + "loss": 1.8186, + "step": 1678 + }, + { + "epoch": 0.5248515160987809, + "grad_norm": 0.220703125, + "learning_rate": 0.00019170878317194924, + "loss": 1.6052, + "step": 1679 + }, + { + "epoch": 0.5251641137855579, + "grad_norm": 0.23828125, + "learning_rate": 0.00019169898448185256, + "loss": 1.7156, + "step": 1680 + }, + { + "epoch": 0.5254767114723351, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019168918025577134, + "loss": 1.7039, + "step": 1681 + }, + { + "epoch": 0.5257893091591123, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019167937049429745, + "loss": 1.8326, + "step": 1682 + }, + { + "epoch": 0.5261019068458893, + "grad_norm": 0.228515625, + "learning_rate": 0.00019166955519802316, + "loss": 1.6872, + "step": 1683 + }, + { + "epoch": 0.5264145045326665, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019165973436754098, + "loss": 1.6172, + "step": 1684 + }, + { + "epoch": 0.5267271022194435, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019164990800344387, + "loss": 1.7482, + "step": 1685 + }, + { + "epoch": 0.5270396999062207, + "grad_norm": 0.224609375, + "learning_rate": 0.000191640076106325, + "loss": 1.6177, + "step": 1686 + }, + { + "epoch": 0.5273522975929978, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019163023867677797, + "loss": 1.6793, + "step": 1687 + }, + { + "epoch": 0.5276648952797749, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019162039571539666, + "loss": 1.6634, + "step": 1688 + }, + { + "epoch": 0.527977492966552, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001916105472227753, + "loss": 1.7808, + "step": 1689 + }, + { + "epoch": 0.5282900906533292, + "grad_norm": 0.228515625, + "learning_rate": 0.00019160069319950845, + "loss": 1.7203, + "step": 1690 + }, + { + "epoch": 0.5286026883401063, + "grad_norm": 0.23046875, + "learning_rate": 0.00019159083364619103, + "loss": 1.6893, + "step": 1691 + }, + { + "epoch": 0.5289152860268834, + "grad_norm": 0.349609375, + "learning_rate": 0.0001915809685634183, + "loss": 2.3232, + "step": 1692 + }, + { + "epoch": 0.5292278837136605, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001915710979517858, + "loss": 1.554, + "step": 1693 + }, + { + "epoch": 0.5295404814004376, + "grad_norm": 0.234375, + "learning_rate": 0.0001915612218118894, + "loss": 1.6621, + "step": 1694 + }, + { + "epoch": 0.5298530790872148, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019155134014432534, + "loss": 1.8881, + "step": 1695 + }, + { + "epoch": 0.5301656767739918, + "grad_norm": 0.22265625, + "learning_rate": 0.00019154145294969022, + "loss": 1.8313, + "step": 1696 + }, + { + "epoch": 0.530478274460769, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019153156022858094, + "loss": 1.7908, + "step": 1697 + }, + { + "epoch": 0.5307908721475461, + "grad_norm": 0.224609375, + "learning_rate": 0.00019152166198159476, + "loss": 1.6425, + "step": 1698 + }, + { + "epoch": 0.5311034698343232, + "grad_norm": 0.21484375, + "learning_rate": 0.00019151175820932917, + "loss": 1.7114, + "step": 1699 + }, + { + "epoch": 0.5314160675211004, + "grad_norm": 0.2109375, + "learning_rate": 0.00019150184891238216, + "loss": 1.5121, + "step": 1700 + }, + { + "epoch": 0.5317286652078774, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019149193409135192, + 
"loss": 1.7762, + "step": 1701 + }, + { + "epoch": 0.5320412628946546, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019148201374683704, + "loss": 1.8021, + "step": 1702 + }, + { + "epoch": 0.5323538605814317, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019147208787943638, + "loss": 1.8559, + "step": 1703 + }, + { + "epoch": 0.5326664582682088, + "grad_norm": 0.33984375, + "learning_rate": 0.00019146215648974924, + "loss": 2.3382, + "step": 1704 + }, + { + "epoch": 0.532979055954986, + "grad_norm": 0.23046875, + "learning_rate": 0.00019145221957837515, + "loss": 1.6269, + "step": 1705 + }, + { + "epoch": 0.533291653641763, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019144227714591402, + "loss": 1.8329, + "step": 1706 + }, + { + "epoch": 0.5336042513285402, + "grad_norm": 0.23046875, + "learning_rate": 0.0001914323291929661, + "loss": 1.7395, + "step": 1707 + }, + { + "epoch": 0.5339168490153173, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019142237572013197, + "loss": 1.4983, + "step": 1708 + }, + { + "epoch": 0.5342294467020944, + "grad_norm": 0.220703125, + "learning_rate": 0.00019141241672801247, + "loss": 1.7625, + "step": 1709 + }, + { + "epoch": 0.5345420443888715, + "grad_norm": 0.23046875, + "learning_rate": 0.0001914024522172089, + "loss": 1.8429, + "step": 1710 + }, + { + "epoch": 0.5348546420756486, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019139248218832285, + "loss": 1.9247, + "step": 1711 + }, + { + "epoch": 0.5351672397624258, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019138250664195615, + "loss": 1.6563, + "step": 1712 + }, + { + "epoch": 0.5354798374492029, + "grad_norm": 0.216796875, + "learning_rate": 0.0001913725255787111, + "loss": 1.5108, + "step": 1713 + }, + { + "epoch": 0.53579243513598, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019136253899919024, + "loss": 1.8109, + "step": 1714 + }, + { + "epoch": 0.5361050328227571, + "grad_norm": 0.21875, + "learning_rate": 0.00019135254690399648, + "loss": 1.7063, + "step": 1715 + }, + { + "epoch": 0.5364176305095343, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019134254929373303, + "loss": 1.7218, + "step": 1716 + }, + { + "epoch": 0.5367302281963113, + "grad_norm": 0.232421875, + "learning_rate": 0.00019133254616900347, + "loss": 1.6555, + "step": 1717 + }, + { + "epoch": 0.5370428258830885, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019132253753041174, + "loss": 1.9246, + "step": 1718 + }, + { + "epoch": 0.5373554235698655, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019131252337856205, + "loss": 1.818, + "step": 1719 + }, + { + "epoch": 0.5376680212566427, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019130250371405895, + "loss": 1.6691, + "step": 1720 + }, + { + "epoch": 0.5379806189434199, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019129247853750733, + "loss": 1.6272, + "step": 1721 + }, + { + "epoch": 0.5382932166301969, + "grad_norm": 0.2109375, + "learning_rate": 0.0001912824478495125, + "loss": 1.529, + "step": 1722 + }, + { + "epoch": 0.5386058143169741, + "grad_norm": 0.224609375, + "learning_rate": 0.00019127241165067994, + "loss": 1.8957, + "step": 1723 + }, + { + "epoch": 0.5389184120037511, + "grad_norm": 0.216796875, + "learning_rate": 0.00019126236994161558, + "loss": 1.6643, + "step": 1724 + }, + { + "epoch": 0.5392310096905283, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019125232272292563, + "loss": 1.8746, + "step": 1725 + }, + { + "epoch": 0.5395436073773054, + "grad_norm": 0.2392578125, + 
"learning_rate": 0.00019124226999521672, + "loss": 1.5691, + "step": 1726 + }, + { + "epoch": 0.5398562050640825, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019123221175909567, + "loss": 1.7902, + "step": 1727 + }, + { + "epoch": 0.5401688027508597, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019122214801516973, + "loss": 1.6767, + "step": 1728 + }, + { + "epoch": 0.5404814004376368, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019121207876404648, + "loss": 1.727, + "step": 1729 + }, + { + "epoch": 0.5407939981244139, + "grad_norm": 0.228515625, + "learning_rate": 0.0001912020040063338, + "loss": 1.6355, + "step": 1730 + }, + { + "epoch": 0.541106595811191, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019119192374263992, + "loss": 1.9062, + "step": 1731 + }, + { + "epoch": 0.5414191934979681, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019118183797357338, + "loss": 1.5986, + "step": 1732 + }, + { + "epoch": 0.5417317911847452, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019117174669974312, + "loss": 1.5961, + "step": 1733 + }, + { + "epoch": 0.5420443888715224, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019116164992175828, + "loss": 1.8585, + "step": 1734 + }, + { + "epoch": 0.5423569865582994, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019115154764022852, + "loss": 1.731, + "step": 1735 + }, + { + "epoch": 0.5426695842450766, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019114143985576366, + "loss": 1.9891, + "step": 1736 + }, + { + "epoch": 0.5429821819318537, + "grad_norm": 0.23046875, + "learning_rate": 0.0001911313265689739, + "loss": 1.6551, + "step": 1737 + }, + { + "epoch": 0.5432947796186308, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019112120778046987, + "loss": 2.0219, + "step": 1738 + }, + { + "epoch": 0.543607377305408, + "grad_norm": 0.21875, + "learning_rate": 0.0001911110834908624, + "loss": 1.7808, + "step": 1739 + }, + { + "epoch": 0.543919974992185, + "grad_norm": 0.23046875, + "learning_rate": 0.0001911009537007627, + "loss": 1.7043, + "step": 1740 + }, + { + "epoch": 0.5442325726789622, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019109081841078233, + "loss": 1.7296, + "step": 1741 + }, + { + "epoch": 0.5445451703657392, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001910806776215332, + "loss": 1.6465, + "step": 1742 + }, + { + "epoch": 0.5448577680525164, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019107053133362749, + "loss": 1.8411, + "step": 1743 + }, + { + "epoch": 0.5451703657392936, + "grad_norm": 0.220703125, + "learning_rate": 0.00019106037954767774, + "loss": 1.4522, + "step": 1744 + }, + { + "epoch": 0.5454829634260706, + "grad_norm": 0.212890625, + "learning_rate": 0.00019105022226429682, + "loss": 1.7463, + "step": 1745 + }, + { + "epoch": 0.5457955611128478, + "grad_norm": 0.22265625, + "learning_rate": 0.00019104005948409797, + "loss": 1.622, + "step": 1746 + }, + { + "epoch": 0.5461081587996249, + "grad_norm": 0.234375, + "learning_rate": 0.00019102989120769475, + "loss": 1.8334, + "step": 1747 + }, + { + "epoch": 0.546420756486402, + "grad_norm": 0.236328125, + "learning_rate": 0.00019101971743570094, + "loss": 1.6375, + "step": 1748 + }, + { + "epoch": 0.5467333541731791, + "grad_norm": 0.224609375, + "learning_rate": 0.00019100953816873084, + "loss": 1.4945, + "step": 1749 + }, + { + "epoch": 0.5470459518599562, + "grad_norm": 0.21875, + "learning_rate": 0.00019099935340739893, + "loss": 1.687, + "step": 1750 + }, + { + "epoch": 
0.5473585495467334, + "grad_norm": 0.251953125, + "learning_rate": 0.0001909891631523201, + "loss": 1.8769, + "step": 1751 + }, + { + "epoch": 0.5476711472335105, + "grad_norm": 0.228515625, + "learning_rate": 0.00019097896740410955, + "loss": 1.814, + "step": 1752 + }, + { + "epoch": 0.5479837449202876, + "grad_norm": 0.224609375, + "learning_rate": 0.00019096876616338278, + "loss": 1.8215, + "step": 1753 + }, + { + "epoch": 0.5482963426070647, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019095855943075568, + "loss": 1.6682, + "step": 1754 + }, + { + "epoch": 0.5486089402938418, + "grad_norm": 0.234375, + "learning_rate": 0.00019094834720684447, + "loss": 1.8052, + "step": 1755 + }, + { + "epoch": 0.5489215379806189, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001909381294922656, + "loss": 1.7685, + "step": 1756 + }, + { + "epoch": 0.5492341356673961, + "grad_norm": 0.2392578125, + "learning_rate": 0.000190927906287636, + "loss": 1.6704, + "step": 1757 + }, + { + "epoch": 0.5495467333541731, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001909176775935728, + "loss": 1.75, + "step": 1758 + }, + { + "epoch": 0.5498593310409503, + "grad_norm": 0.240234375, + "learning_rate": 0.00019090744341069356, + "loss": 1.5139, + "step": 1759 + }, + { + "epoch": 0.5501719287277275, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019089720373961612, + "loss": 1.5844, + "step": 1760 + }, + { + "epoch": 0.5504845264145045, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019088695858095864, + "loss": 1.7899, + "step": 1761 + }, + { + "epoch": 0.5507971241012817, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019087670793533967, + "loss": 1.7717, + "step": 1762 + }, + { + "epoch": 0.5511097217880587, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019086645180337803, + "loss": 1.7754, + "step": 1763 + }, + { + "epoch": 0.5514223194748359, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001908561901856929, + "loss": 1.8412, + "step": 1764 + }, + { + "epoch": 0.551734917161613, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001908459230829038, + "loss": 1.7254, + "step": 1765 + }, + { + "epoch": 0.5520475148483901, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019083565049563057, + "loss": 1.8097, + "step": 1766 + }, + { + "epoch": 0.5523601125351673, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019082537242449333, + "loss": 1.8441, + "step": 1767 + }, + { + "epoch": 0.5526727102219443, + "grad_norm": 0.328125, + "learning_rate": 0.00019081508887011263, + "loss": 2.4757, + "step": 1768 + }, + { + "epoch": 0.5529853079087215, + "grad_norm": 0.21875, + "learning_rate": 0.0001908047998331093, + "loss": 1.5833, + "step": 1769 + }, + { + "epoch": 0.5532979055954986, + "grad_norm": 0.359375, + "learning_rate": 0.0001907945053141045, + "loss": 2.4293, + "step": 1770 + }, + { + "epoch": 0.5536105032822757, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001907842053137197, + "loss": 1.9397, + "step": 1771 + }, + { + "epoch": 0.5539231009690528, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001907738998325767, + "loss": 2.0662, + "step": 1772 + }, + { + "epoch": 0.55423569865583, + "grad_norm": 0.228515625, + "learning_rate": 0.00019076358887129774, + "loss": 1.8447, + "step": 1773 + }, + { + "epoch": 0.554548296342607, + "grad_norm": 0.32421875, + "learning_rate": 0.00019075327243050526, + "loss": 2.3451, + "step": 1774 + }, + { + "epoch": 0.5548608940293842, + "grad_norm": 0.228515625, + "learning_rate": 0.00019074295051082205, + "loss": 1.623, + 
"step": 1775 + }, + { + "epoch": 0.5551734917161613, + "grad_norm": 0.240234375, + "learning_rate": 0.0001907326231128713, + "loss": 2.0579, + "step": 1776 + }, + { + "epoch": 0.5554860894029384, + "grad_norm": 0.216796875, + "learning_rate": 0.00019072229023727645, + "loss": 1.6111, + "step": 1777 + }, + { + "epoch": 0.5557986870897156, + "grad_norm": 0.224609375, + "learning_rate": 0.00019071195188466135, + "loss": 1.87, + "step": 1778 + }, + { + "epoch": 0.5561112847764926, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019070160805565012, + "loss": 1.6437, + "step": 1779 + }, + { + "epoch": 0.5564238824632698, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019069125875086722, + "loss": 1.6752, + "step": 1780 + }, + { + "epoch": 0.5567364801500468, + "grad_norm": 0.236328125, + "learning_rate": 0.00019068090397093745, + "loss": 1.7323, + "step": 1781 + }, + { + "epoch": 0.557049077836824, + "grad_norm": 0.228515625, + "learning_rate": 0.000190670543716486, + "loss": 1.7324, + "step": 1782 + }, + { + "epoch": 0.5573616755236012, + "grad_norm": 0.22265625, + "learning_rate": 0.00019066017798813825, + "loss": 1.5224, + "step": 1783 + }, + { + "epoch": 0.5576742732103782, + "grad_norm": 0.326171875, + "learning_rate": 0.00019064980678652, + "loss": 2.3167, + "step": 1784 + }, + { + "epoch": 0.5579868708971554, + "grad_norm": 0.212890625, + "learning_rate": 0.00019063943011225743, + "loss": 1.7731, + "step": 1785 + }, + { + "epoch": 0.5582994685839325, + "grad_norm": 0.23828125, + "learning_rate": 0.00019062904796597697, + "loss": 1.6789, + "step": 1786 + }, + { + "epoch": 0.5586120662707096, + "grad_norm": 0.224609375, + "learning_rate": 0.00019061866034830534, + "loss": 1.7119, + "step": 1787 + }, + { + "epoch": 0.5589246639574867, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019060826725986977, + "loss": 1.6962, + "step": 1788 + }, + { + "epoch": 0.5592372616442638, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019059786870129761, + "loss": 1.6318, + "step": 1789 + }, + { + "epoch": 0.559549859331041, + "grad_norm": 0.21875, + "learning_rate": 0.0001905874646732167, + "loss": 1.8541, + "step": 1790 + }, + { + "epoch": 0.5598624570178181, + "grad_norm": 0.2265625, + "learning_rate": 0.00019057705517625505, + "loss": 1.8081, + "step": 1791 + }, + { + "epoch": 0.5601750547045952, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001905666402110412, + "loss": 1.4779, + "step": 1792 + }, + { + "epoch": 0.5604876523913723, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019055621977820387, + "loss": 1.6657, + "step": 1793 + }, + { + "epoch": 0.5608002500781494, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019054579387837214, + "loss": 1.5665, + "step": 1794 + }, + { + "epoch": 0.5611128477649265, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019053536251217545, + "loss": 1.5586, + "step": 1795 + }, + { + "epoch": 0.5614254454517037, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019052492568024355, + "loss": 1.5323, + "step": 1796 + }, + { + "epoch": 0.5617380431384807, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019051448338320656, + "loss": 1.7868, + "step": 1797 + }, + { + "epoch": 0.5620506408252579, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019050403562169486, + "loss": 1.6351, + "step": 1798 + }, + { + "epoch": 0.562363238512035, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019049358239633916, + "loss": 2.0889, + "step": 1799 + }, + { + "epoch": 0.5626758361988121, + "grad_norm": 0.2255859375, + "learning_rate": 
0.00019048312370777062, + "loss": 1.5398, + "step": 1800 + }, + { + "epoch": 0.5629884338855893, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019047265955662054, + "loss": 1.6967, + "step": 1801 + }, + { + "epoch": 0.5633010315723663, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019046218994352076, + "loss": 1.6917, + "step": 1802 + }, + { + "epoch": 0.5636136292591435, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001904517148691033, + "loss": 1.4587, + "step": 1803 + }, + { + "epoch": 0.5639262269459207, + "grad_norm": 0.23046875, + "learning_rate": 0.00019044123433400052, + "loss": 1.8214, + "step": 1804 + }, + { + "epoch": 0.5642388246326977, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001904307483388452, + "loss": 1.6375, + "step": 1805 + }, + { + "epoch": 0.5645514223194749, + "grad_norm": 0.2265625, + "learning_rate": 0.00019042025688427035, + "loss": 1.5963, + "step": 1806 + }, + { + "epoch": 0.5648640200062519, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019040975997090936, + "loss": 1.8623, + "step": 1807 + }, + { + "epoch": 0.5651766176930291, + "grad_norm": 0.220703125, + "learning_rate": 0.00019039925759939597, + "loss": 1.6458, + "step": 1808 + }, + { + "epoch": 0.5654892153798062, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001903887497703642, + "loss": 1.6367, + "step": 1809 + }, + { + "epoch": 0.5658018130665833, + "grad_norm": 0.216796875, + "learning_rate": 0.00019037823648444842, + "loss": 1.6211, + "step": 1810 + }, + { + "epoch": 0.5661144107533604, + "grad_norm": 0.220703125, + "learning_rate": 0.0001903677177422833, + "loss": 1.5955, + "step": 1811 + }, + { + "epoch": 0.5664270084401375, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019035719354450393, + "loss": 1.6509, + "step": 1812 + }, + { + "epoch": 0.5667396061269147, + "grad_norm": 0.2373046875, + "learning_rate": 0.00019034666389174568, + "loss": 1.5193, + "step": 1813 + }, + { + "epoch": 0.5670522038136918, + "grad_norm": 0.23046875, + "learning_rate": 0.00019033612878464412, + "loss": 1.8779, + "step": 1814 + }, + { + "epoch": 0.5673648015004689, + "grad_norm": 0.232421875, + "learning_rate": 0.00019032558822383542, + "loss": 1.746, + "step": 1815 + }, + { + "epoch": 0.567677399187246, + "grad_norm": 0.21875, + "learning_rate": 0.0001903150422099558, + "loss": 1.6802, + "step": 1816 + }, + { + "epoch": 0.5679899968740232, + "grad_norm": 0.2373046875, + "learning_rate": 0.00019030449074364204, + "loss": 1.8168, + "step": 1817 + }, + { + "epoch": 0.5683025945608002, + "grad_norm": 0.23046875, + "learning_rate": 0.00019029393382553108, + "loss": 1.6261, + "step": 1818 + }, + { + "epoch": 0.5686151922475774, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019028337145626028, + "loss": 1.6126, + "step": 1819 + }, + { + "epoch": 0.5689277899343544, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019027280363646728, + "loss": 1.7607, + "step": 1820 + }, + { + "epoch": 0.5692403876211316, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001902622303667901, + "loss": 1.6267, + "step": 1821 + }, + { + "epoch": 0.5695529853079088, + "grad_norm": 0.22265625, + "learning_rate": 0.00019025165164786705, + "loss": 1.7209, + "step": 1822 + }, + { + "epoch": 0.5698655829946858, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019024106748033679, + "loss": 1.4932, + "step": 1823 + }, + { + "epoch": 0.570178180681463, + "grad_norm": 0.236328125, + "learning_rate": 0.00019023047786483828, + "loss": 1.4764, + "step": 1824 + }, + { + "epoch": 0.57049077836824, + 
"grad_norm": 0.228515625, + "learning_rate": 0.00019021988280201084, + "loss": 1.6664, + "step": 1825 + }, + { + "epoch": 0.5708033760550172, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001902092822924941, + "loss": 1.5628, + "step": 1826 + }, + { + "epoch": 0.5711159737417943, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019019867633692802, + "loss": 1.8942, + "step": 1827 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019018806493595293, + "loss": 1.5664, + "step": 1828 + }, + { + "epoch": 0.5717411691153486, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019017744809020942, + "loss": 1.4663, + "step": 1829 + }, + { + "epoch": 0.5720537668021257, + "grad_norm": 0.22265625, + "learning_rate": 0.00019016682580033848, + "loss": 1.8574, + "step": 1830 + }, + { + "epoch": 0.5723663644889028, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019015619806698135, + "loss": 1.7824, + "step": 1831 + }, + { + "epoch": 0.5726789621756799, + "grad_norm": 0.224609375, + "learning_rate": 0.00019014556489077965, + "loss": 1.5226, + "step": 1832 + }, + { + "epoch": 0.572991559862457, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019013492627237532, + "loss": 1.8333, + "step": 1833 + }, + { + "epoch": 0.5733041575492341, + "grad_norm": 0.2236328125, + "learning_rate": 0.00019012428221241065, + "loss": 1.5824, + "step": 1834 + }, + { + "epoch": 0.5736167552360113, + "grad_norm": 0.232421875, + "learning_rate": 0.00019011363271152822, + "loss": 1.7483, + "step": 1835 + }, + { + "epoch": 0.5739293529227883, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019010297777037093, + "loss": 1.6215, + "step": 1836 + }, + { + "epoch": 0.5742419506095655, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019009231738958206, + "loss": 1.6124, + "step": 1837 + }, + { + "epoch": 0.5745545482963426, + "grad_norm": 0.234375, + "learning_rate": 0.00019008165156980517, + "loss": 1.8104, + "step": 1838 + }, + { + "epoch": 0.5748671459831197, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001900709803116842, + "loss": 1.7839, + "step": 1839 + }, + { + "epoch": 0.5751797436698969, + "grad_norm": 0.216796875, + "learning_rate": 0.0001900603036158634, + "loss": 1.6926, + "step": 1840 + }, + { + "epoch": 0.5754923413566739, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019004962148298725, + "loss": 1.8372, + "step": 1841 + }, + { + "epoch": 0.5758049390434511, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001900389339137007, + "loss": 1.5496, + "step": 1842 + }, + { + "epoch": 0.5761175367302281, + "grad_norm": 0.2275390625, + "learning_rate": 0.000190028240908649, + "loss": 1.7024, + "step": 1843 + }, + { + "epoch": 0.5764301344170053, + "grad_norm": 0.236328125, + "learning_rate": 0.00019001754246847767, + "loss": 1.6237, + "step": 1844 + }, + { + "epoch": 0.5767427321037825, + "grad_norm": 0.23046875, + "learning_rate": 0.00019000683859383258, + "loss": 1.6012, + "step": 1845 + }, + { + "epoch": 0.5770553297905595, + "grad_norm": 0.2119140625, + "learning_rate": 0.00018999612928535995, + "loss": 1.7586, + "step": 1846 + }, + { + "epoch": 0.5773679274773367, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018998541454370632, + "loss": 1.4823, + "step": 1847 + }, + { + "epoch": 0.5776805251641138, + "grad_norm": 0.2265625, + "learning_rate": 0.00018997469436951854, + "loss": 1.5688, + "step": 1848 + }, + { + "epoch": 0.5779931228508909, + "grad_norm": 0.318359375, + "learning_rate": 0.0001899639687634438, + "loss": 2.5108, + "step": 
1849 + }, + { + "epoch": 0.578305720537668, + "grad_norm": 0.2490234375, + "learning_rate": 0.00018995323772612964, + "loss": 1.6868, + "step": 1850 + }, + { + "epoch": 0.5786183182244451, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018994250125822386, + "loss": 1.6238, + "step": 1851 + }, + { + "epoch": 0.5789309159112223, + "grad_norm": 0.220703125, + "learning_rate": 0.0001899317593603747, + "loss": 1.5826, + "step": 1852 + }, + { + "epoch": 0.5792435135979994, + "grad_norm": 0.2265625, + "learning_rate": 0.0001899210120332306, + "loss": 1.6792, + "step": 1853 + }, + { + "epoch": 0.5795561112847765, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018991025927744042, + "loss": 1.8574, + "step": 1854 + }, + { + "epoch": 0.5798687089715536, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001898995010936533, + "loss": 1.7686, + "step": 1855 + }, + { + "epoch": 0.5801813066583307, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018988873748251877, + "loss": 1.7198, + "step": 1856 + }, + { + "epoch": 0.5804939043451078, + "grad_norm": 0.2177734375, + "learning_rate": 0.00018987796844468658, + "loss": 1.7134, + "step": 1857 + }, + { + "epoch": 0.580806502031885, + "grad_norm": 0.212890625, + "learning_rate": 0.00018986719398080695, + "loss": 1.5788, + "step": 1858 + }, + { + "epoch": 0.581119099718662, + "grad_norm": 0.2265625, + "learning_rate": 0.00018985641409153026, + "loss": 1.6557, + "step": 1859 + }, + { + "epoch": 0.5814316974054392, + "grad_norm": 0.23046875, + "learning_rate": 0.00018984562877750737, + "loss": 1.719, + "step": 1860 + }, + { + "epoch": 0.5817442950922164, + "grad_norm": 0.2265625, + "learning_rate": 0.00018983483803938932, + "loss": 1.7116, + "step": 1861 + }, + { + "epoch": 0.5820568927789934, + "grad_norm": 0.236328125, + "learning_rate": 0.0001898240418778277, + "loss": 1.9006, + "step": 1862 + }, + { + "epoch": 0.5823694904657706, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018981324029347416, + "loss": 1.3191, + "step": 1863 + }, + { + "epoch": 0.5826820881525476, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018980243328698088, + "loss": 1.7602, + "step": 1864 + }, + { + "epoch": 0.5829946858393248, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018979162085900025, + "loss": 2.0473, + "step": 1865 + }, + { + "epoch": 0.583307283526102, + "grad_norm": 0.234375, + "learning_rate": 0.00018978080301018503, + "loss": 1.7591, + "step": 1866 + }, + { + "epoch": 0.583619881212879, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018976997974118836, + "loss": 1.9532, + "step": 1867 + }, + { + "epoch": 0.5839324788996562, + "grad_norm": 0.234375, + "learning_rate": 0.0001897591510526636, + "loss": 1.8456, + "step": 1868 + }, + { + "epoch": 0.5842450765864332, + "grad_norm": 0.23046875, + "learning_rate": 0.00018974831694526452, + "loss": 1.7148, + "step": 1869 + }, + { + "epoch": 0.5845576742732104, + "grad_norm": 0.2158203125, + "learning_rate": 0.00018973747741964515, + "loss": 1.6221, + "step": 1870 + }, + { + "epoch": 0.5848702719599875, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018972663247645994, + "loss": 2.0677, + "step": 1871 + }, + { + "epoch": 0.5851828696467646, + "grad_norm": 0.2421875, + "learning_rate": 0.00018971578211636359, + "loss": 1.4428, + "step": 1872 + }, + { + "epoch": 0.5854954673335417, + "grad_norm": 0.224609375, + "learning_rate": 0.00018970492634001114, + "loss": 1.6225, + "step": 1873 + }, + { + "epoch": 0.5858080650203189, + "grad_norm": 0.2236328125, + "learning_rate": 
0.00018969406514805797, + "loss": 1.5286, + "step": 1874 + }, + { + "epoch": 0.586120662707096, + "grad_norm": 0.2421875, + "learning_rate": 0.00018968319854115978, + "loss": 1.7499, + "step": 1875 + }, + { + "epoch": 0.5864332603938731, + "grad_norm": 0.2197265625, + "learning_rate": 0.00018967232651997265, + "loss": 1.9038, + "step": 1876 + }, + { + "epoch": 0.5867458580806502, + "grad_norm": 0.21484375, + "learning_rate": 0.00018966144908515284, + "loss": 1.5464, + "step": 1877 + }, + { + "epoch": 0.5870584557674273, + "grad_norm": 0.23046875, + "learning_rate": 0.00018965056623735713, + "loss": 1.6405, + "step": 1878 + }, + { + "epoch": 0.5873710534542045, + "grad_norm": 0.2099609375, + "learning_rate": 0.00018963967797724248, + "loss": 1.727, + "step": 1879 + }, + { + "epoch": 0.5876836511409815, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018962878430546626, + "loss": 1.7438, + "step": 1880 + }, + { + "epoch": 0.5879962488277587, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001896178852226861, + "loss": 1.6973, + "step": 1881 + }, + { + "epoch": 0.5883088465145357, + "grad_norm": 0.228515625, + "learning_rate": 0.00018960698072956, + "loss": 1.7813, + "step": 1882 + }, + { + "epoch": 0.5886214442013129, + "grad_norm": 0.224609375, + "learning_rate": 0.00018959607082674632, + "loss": 1.8691, + "step": 1883 + }, + { + "epoch": 0.5889340418880901, + "grad_norm": 0.2265625, + "learning_rate": 0.00018958515551490364, + "loss": 1.8186, + "step": 1884 + }, + { + "epoch": 0.5892466395748671, + "grad_norm": 0.224609375, + "learning_rate": 0.00018957423479469096, + "loss": 1.6628, + "step": 1885 + }, + { + "epoch": 0.5895592372616443, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001895633086667676, + "loss": 1.8004, + "step": 1886 + }, + { + "epoch": 0.5898718349484214, + "grad_norm": 0.24609375, + "learning_rate": 0.00018955237713179314, + "loss": 1.781, + "step": 1887 + }, + { + "epoch": 0.5901844326351985, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018954144019042759, + "loss": 1.7539, + "step": 1888 + }, + { + "epoch": 0.5904970303219756, + "grad_norm": 0.23046875, + "learning_rate": 0.00018953049784333116, + "loss": 1.6668, + "step": 1889 + }, + { + "epoch": 0.5908096280087527, + "grad_norm": 0.228515625, + "learning_rate": 0.00018951955009116449, + "loss": 1.954, + "step": 1890 + }, + { + "epoch": 0.5911222256955299, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001895085969345885, + "loss": 1.8232, + "step": 1891 + }, + { + "epoch": 0.591434823382307, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018949763837426445, + "loss": 1.5966, + "step": 1892 + }, + { + "epoch": 0.5917474210690841, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018948667441085398, + "loss": 1.5623, + "step": 1893 + }, + { + "epoch": 0.5920600187558612, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018947570504501888, + "loss": 1.689, + "step": 1894 + }, + { + "epoch": 0.5923726164426383, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018946473027742146, + "loss": 1.6939, + "step": 1895 + }, + { + "epoch": 0.5926852141294154, + "grad_norm": 0.228515625, + "learning_rate": 0.00018945375010872426, + "loss": 1.7252, + "step": 1896 + }, + { + "epoch": 0.5929978118161926, + "grad_norm": 0.220703125, + "learning_rate": 0.0001894427645395902, + "loss": 1.7894, + "step": 1897 + }, + { + "epoch": 0.5933104095029696, + "grad_norm": 0.234375, + "learning_rate": 0.00018943177357068244, + "loss": 1.8643, + "step": 1898 + }, + { + "epoch": 0.5936230071897468, + "grad_norm": 
0.2158203125, + "learning_rate": 0.00018942077720266454, + "loss": 1.6017, + "step": 1899 + }, + { + "epoch": 0.5939356048765239, + "grad_norm": 0.22265625, + "learning_rate": 0.0001894097754362004, + "loss": 1.514, + "step": 1900 + }, + { + "epoch": 0.594248202563301, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018939876827195418, + "loss": 1.8716, + "step": 1901 + }, + { + "epoch": 0.5945608002500782, + "grad_norm": 0.232421875, + "learning_rate": 0.00018938775571059039, + "loss": 1.8103, + "step": 1902 + }, + { + "epoch": 0.5948733979368552, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018937673775277388, + "loss": 1.5777, + "step": 1903 + }, + { + "epoch": 0.5951859956236324, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001893657143991698, + "loss": 1.6428, + "step": 1904 + }, + { + "epoch": 0.5954985933104096, + "grad_norm": 0.224609375, + "learning_rate": 0.00018935468565044368, + "loss": 2.0165, + "step": 1905 + }, + { + "epoch": 0.5958111909971866, + "grad_norm": 0.22265625, + "learning_rate": 0.00018934365150726133, + "loss": 1.5724, + "step": 1906 + }, + { + "epoch": 0.5961237886839638, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018933261197028885, + "loss": 1.9301, + "step": 1907 + }, + { + "epoch": 0.5964363863707408, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001893215670401928, + "loss": 1.6571, + "step": 1908 + }, + { + "epoch": 0.596748984057518, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018931051671763988, + "loss": 1.7479, + "step": 1909 + }, + { + "epoch": 0.5970615817442951, + "grad_norm": 0.228515625, + "learning_rate": 0.00018929946100329725, + "loss": 1.6891, + "step": 1910 + }, + { + "epoch": 0.5973741794310722, + "grad_norm": 0.23046875, + "learning_rate": 0.0001892883998978324, + "loss": 1.646, + "step": 1911 + }, + { + "epoch": 0.5976867771178493, + "grad_norm": 0.2197265625, + "learning_rate": 0.00018927733340191308, + "loss": 1.6963, + "step": 1912 + }, + { + "epoch": 0.5979993748046264, + "grad_norm": 0.2265625, + "learning_rate": 0.00018926626151620732, + "loss": 1.9789, + "step": 1913 + }, + { + "epoch": 0.5983119724914036, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018925518424138361, + "loss": 1.9244, + "step": 1914 + }, + { + "epoch": 0.5986245701781807, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018924410157811073, + "loss": 1.5019, + "step": 1915 + }, + { + "epoch": 0.5989371678649578, + "grad_norm": 0.2265625, + "learning_rate": 0.0001892330135270577, + "loss": 1.7337, + "step": 1916 + }, + { + "epoch": 0.5992497655517349, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001892219200888939, + "loss": 1.6027, + "step": 1917 + }, + { + "epoch": 0.5995623632385121, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018921082126428912, + "loss": 1.6431, + "step": 1918 + }, + { + "epoch": 0.5998749609252891, + "grad_norm": 0.22265625, + "learning_rate": 0.00018919971705391335, + "loss": 1.822, + "step": 1919 + }, + { + "epoch": 0.6001875586120663, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018918860745843703, + "loss": 1.656, + "step": 1920 + }, + { + "epoch": 0.6005001562988433, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018917749247853078, + "loss": 1.6685, + "step": 1921 + }, + { + "epoch": 0.6008127539856205, + "grad_norm": 0.21875, + "learning_rate": 0.0001891663721148657, + "loss": 1.8229, + "step": 1922 + }, + { + "epoch": 0.6011253516723977, + "grad_norm": 0.228515625, + "learning_rate": 0.0001891552463681131, + "loss": 1.7224, + "step": 1923 + }, + { + "epoch": 
0.6014379493591747, + "grad_norm": 0.23046875, + "learning_rate": 0.00018914411523894467, + "loss": 1.9986, + "step": 1924 + }, + { + "epoch": 0.6017505470459519, + "grad_norm": 0.22265625, + "learning_rate": 0.0001891329787280324, + "loss": 1.4848, + "step": 1925 + }, + { + "epoch": 0.6020631447327289, + "grad_norm": 0.224609375, + "learning_rate": 0.00018912183683604864, + "loss": 1.7737, + "step": 1926 + }, + { + "epoch": 0.6023757424195061, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018911068956366597, + "loss": 1.7155, + "step": 1927 + }, + { + "epoch": 0.6026883401062832, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018909953691155745, + "loss": 1.7669, + "step": 1928 + }, + { + "epoch": 0.6030009377930603, + "grad_norm": 0.232421875, + "learning_rate": 0.00018908837888039637, + "loss": 1.8628, + "step": 1929 + }, + { + "epoch": 0.6033135354798375, + "grad_norm": 0.23046875, + "learning_rate": 0.0001890772154708563, + "loss": 1.7606, + "step": 1930 + }, + { + "epoch": 0.6036261331666146, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001890660466836112, + "loss": 1.5453, + "step": 1931 + }, + { + "epoch": 0.6039387308533917, + "grad_norm": 0.236328125, + "learning_rate": 0.00018905487251933542, + "loss": 1.7034, + "step": 1932 + }, + { + "epoch": 0.6042513285401688, + "grad_norm": 0.23046875, + "learning_rate": 0.00018904369297870349, + "loss": 1.6582, + "step": 1933 + }, + { + "epoch": 0.6045639262269459, + "grad_norm": 0.2421875, + "learning_rate": 0.0001890325080623903, + "loss": 1.5893, + "step": 1934 + }, + { + "epoch": 0.604876523913723, + "grad_norm": 0.220703125, + "learning_rate": 0.00018902131777107117, + "loss": 1.602, + "step": 1935 + }, + { + "epoch": 0.6051891216005002, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018901012210542165, + "loss": 1.636, + "step": 1936 + }, + { + "epoch": 0.6055017192872773, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018899892106611762, + "loss": 1.8495, + "step": 1937 + }, + { + "epoch": 0.6058143169740544, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018898771465383532, + "loss": 1.7294, + "step": 1938 + }, + { + "epoch": 0.6061269146608315, + "grad_norm": 0.23046875, + "learning_rate": 0.0001889765028692513, + "loss": 1.6063, + "step": 1939 + }, + { + "epoch": 0.6064395123476086, + "grad_norm": 0.23046875, + "learning_rate": 0.0001889652857130424, + "loss": 1.5972, + "step": 1940 + }, + { + "epoch": 0.6067521100343858, + "grad_norm": 0.224609375, + "learning_rate": 0.00018895406318588585, + "loss": 1.9705, + "step": 1941 + }, + { + "epoch": 0.6070647077211628, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018894283528845914, + "loss": 1.9463, + "step": 1942 + }, + { + "epoch": 0.60737730540794, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018893160202144012, + "loss": 1.7365, + "step": 1943 + }, + { + "epoch": 0.6076899030947172, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018892036338550696, + "loss": 1.6313, + "step": 1944 + }, + { + "epoch": 0.6080025007814942, + "grad_norm": 0.228515625, + "learning_rate": 0.00018890911938133814, + "loss": 1.7297, + "step": 1945 + }, + { + "epoch": 0.6083150984682714, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001888978700096125, + "loss": 1.5932, + "step": 1946 + }, + { + "epoch": 0.6086276961550484, + "grad_norm": 0.255859375, + "learning_rate": 0.00018888661527100914, + "loss": 1.7416, + "step": 1947 + }, + { + "epoch": 0.6089402938418256, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001888753551662076, + "loss": 1.5615, 
+ "step": 1948 + }, + { + "epoch": 0.6092528915286027, + "grad_norm": 0.21875, + "learning_rate": 0.00018886408969588756, + "loss": 1.9525, + "step": 1949 + }, + { + "epoch": 0.6095654892153798, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001888528188607292, + "loss": 1.4709, + "step": 1950 + }, + { + "epoch": 0.609878086902157, + "grad_norm": 0.2265625, + "learning_rate": 0.00018884154266141296, + "loss": 1.6341, + "step": 1951 + }, + { + "epoch": 0.610190684588934, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018883026109861955, + "loss": 1.6915, + "step": 1952 + }, + { + "epoch": 0.6105032822757112, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001888189741730301, + "loss": 1.7387, + "step": 1953 + }, + { + "epoch": 0.6108158799624883, + "grad_norm": 0.23828125, + "learning_rate": 0.000188807681885326, + "loss": 1.4454, + "step": 1954 + }, + { + "epoch": 0.6111284776492654, + "grad_norm": 0.22265625, + "learning_rate": 0.00018879638423618893, + "loss": 1.644, + "step": 1955 + }, + { + "epoch": 0.6114410753360425, + "grad_norm": 0.2265625, + "learning_rate": 0.00018878508122630106, + "loss": 1.6955, + "step": 1956 + }, + { + "epoch": 0.6117536730228196, + "grad_norm": 0.228515625, + "learning_rate": 0.00018877377285634464, + "loss": 1.5826, + "step": 1957 + }, + { + "epoch": 0.6120662707095967, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018876245912700243, + "loss": 1.7957, + "step": 1958 + }, + { + "epoch": 0.6123788683963739, + "grad_norm": 0.23046875, + "learning_rate": 0.00018875114003895748, + "loss": 1.5181, + "step": 1959 + }, + { + "epoch": 0.612691466083151, + "grad_norm": 0.23046875, + "learning_rate": 0.00018873981559289308, + "loss": 1.7115, + "step": 1960 + }, + { + "epoch": 0.6130040637699281, + "grad_norm": 0.236328125, + "learning_rate": 0.00018872848578949296, + "loss": 1.9347, + "step": 1961 + }, + { + "epoch": 0.6133166614567053, + "grad_norm": 0.23046875, + "learning_rate": 0.00018871715062944108, + "loss": 1.7506, + "step": 1962 + }, + { + "epoch": 0.6136292591434823, + "grad_norm": 0.29296875, + "learning_rate": 0.00018870581011342174, + "loss": 2.3271, + "step": 1963 + }, + { + "epoch": 0.6139418568302595, + "grad_norm": 0.228515625, + "learning_rate": 0.00018869446424211962, + "loss": 2.0109, + "step": 1964 + }, + { + "epoch": 0.6142544545170365, + "grad_norm": 0.23046875, + "learning_rate": 0.00018868311301621968, + "loss": 1.5306, + "step": 1965 + }, + { + "epoch": 0.6145670522038137, + "grad_norm": 0.224609375, + "learning_rate": 0.00018867175643640717, + "loss": 1.7745, + "step": 1966 + }, + { + "epoch": 0.6148796498905909, + "grad_norm": 0.23046875, + "learning_rate": 0.00018866039450336777, + "loss": 1.7684, + "step": 1967 + }, + { + "epoch": 0.6151922475773679, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018864902721778734, + "loss": 1.738, + "step": 1968 + }, + { + "epoch": 0.6155048452641451, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018863765458035218, + "loss": 1.6707, + "step": 1969 + }, + { + "epoch": 0.6158174429509221, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018862627659174886, + "loss": 1.5577, + "step": 1970 + }, + { + "epoch": 0.6161300406376993, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018861489325266425, + "loss": 1.6428, + "step": 1971 + }, + { + "epoch": 0.6164426383244764, + "grad_norm": 0.2421875, + "learning_rate": 0.00018860350456378566, + "loss": 1.5885, + "step": 1972 + }, + { + "epoch": 0.6167552360112535, + "grad_norm": 0.21875, + "learning_rate": 
0.00018859211052580057, + "loss": 1.3899, + "step": 1973 + }, + { + "epoch": 0.6170678336980306, + "grad_norm": 0.23046875, + "learning_rate": 0.0001885807111393969, + "loss": 1.8002, + "step": 1974 + }, + { + "epoch": 0.6173804313848078, + "grad_norm": 0.2265625, + "learning_rate": 0.0001885693064052628, + "loss": 1.7554, + "step": 1975 + }, + { + "epoch": 0.6176930290715849, + "grad_norm": 0.22265625, + "learning_rate": 0.0001885578963240868, + "loss": 1.5717, + "step": 1976 + }, + { + "epoch": 0.618005626758362, + "grad_norm": 0.228515625, + "learning_rate": 0.00018854648089655776, + "loss": 1.6693, + "step": 1977 + }, + { + "epoch": 0.6183182244451391, + "grad_norm": 0.2265625, + "learning_rate": 0.00018853506012336482, + "loss": 1.8787, + "step": 1978 + }, + { + "epoch": 0.6186308221319162, + "grad_norm": 0.220703125, + "learning_rate": 0.00018852363400519745, + "loss": 1.6435, + "step": 1979 + }, + { + "epoch": 0.6189434198186934, + "grad_norm": 0.224609375, + "learning_rate": 0.00018851220254274554, + "loss": 1.7522, + "step": 1980 + }, + { + "epoch": 0.6192560175054704, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018850076573669915, + "loss": 1.5828, + "step": 1981 + }, + { + "epoch": 0.6195686151922476, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001884893235877488, + "loss": 1.457, + "step": 1982 + }, + { + "epoch": 0.6198812128790246, + "grad_norm": 0.22265625, + "learning_rate": 0.00018847787609658516, + "loss": 1.5991, + "step": 1983 + }, + { + "epoch": 0.6201938105658018, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001884664232638994, + "loss": 1.598, + "step": 1984 + }, + { + "epoch": 0.620506408252579, + "grad_norm": 0.228515625, + "learning_rate": 0.00018845496509038294, + "loss": 1.6774, + "step": 1985 + }, + { + "epoch": 0.620819005939356, + "grad_norm": 0.220703125, + "learning_rate": 0.00018844350157672755, + "loss": 1.7232, + "step": 1986 + }, + { + "epoch": 0.6211316036261332, + "grad_norm": 0.228515625, + "learning_rate": 0.00018843203272362523, + "loss": 1.7184, + "step": 1987 + }, + { + "epoch": 0.6214442013129103, + "grad_norm": 0.22265625, + "learning_rate": 0.00018842055853176838, + "loss": 1.6561, + "step": 1988 + }, + { + "epoch": 0.6217567989996874, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001884090790018498, + "loss": 1.5792, + "step": 1989 + }, + { + "epoch": 0.6220693966864645, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001883975941345624, + "loss": 1.9449, + "step": 1990 + }, + { + "epoch": 0.6223819943732416, + "grad_norm": 0.251953125, + "learning_rate": 0.00018838610393059964, + "loss": 2.1031, + "step": 1991 + }, + { + "epoch": 0.6226945920600188, + "grad_norm": 0.228515625, + "learning_rate": 0.00018837460839065515, + "loss": 1.9063, + "step": 1992 + }, + { + "epoch": 0.6230071897467959, + "grad_norm": 0.25390625, + "learning_rate": 0.0001883631075154229, + "loss": 2.1289, + "step": 1993 + }, + { + "epoch": 0.623319787433573, + "grad_norm": 0.23828125, + "learning_rate": 0.0001883516013055973, + "loss": 2.0025, + "step": 1994 + }, + { + "epoch": 0.6236323851203501, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001883400897618729, + "loss": 1.8512, + "step": 1995 + }, + { + "epoch": 0.6239449828071272, + "grad_norm": 0.236328125, + "learning_rate": 0.0001883285728849447, + "loss": 1.8326, + "step": 1996 + }, + { + "epoch": 0.6242575804939043, + "grad_norm": 0.224609375, + "learning_rate": 0.00018831705067550805, + "loss": 1.6852, + "step": 1997 + }, + { + "epoch": 0.6245701781806815, + "grad_norm": 
0.2197265625, + "learning_rate": 0.00018830552313425845, + "loss": 1.8256, + "step": 1998 + }, + { + "epoch": 0.6248827758674586, + "grad_norm": 0.23046875, + "learning_rate": 0.0001882939902618919, + "loss": 1.6083, + "step": 1999 + }, + { + "epoch": 0.6251953735542357, + "grad_norm": 0.224609375, + "learning_rate": 0.00018828245205910465, + "loss": 1.7561, + "step": 2000 + }, + { + "epoch": 0.6255079712410128, + "grad_norm": 0.2421875, + "learning_rate": 0.0001882709085265933, + "loss": 1.7635, + "step": 2001 + }, + { + "epoch": 0.6258205689277899, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001882593596650547, + "loss": 1.8553, + "step": 2002 + }, + { + "epoch": 0.6261331666145671, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001882478054751861, + "loss": 1.6012, + "step": 2003 + }, + { + "epoch": 0.6264457643013441, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018823624595768498, + "loss": 1.8742, + "step": 2004 + }, + { + "epoch": 0.6267583619881213, + "grad_norm": 0.23828125, + "learning_rate": 0.0001882246811132493, + "loss": 1.2608, + "step": 2005 + }, + { + "epoch": 0.6270709596748985, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018821311094257716, + "loss": 1.5808, + "step": 2006 + }, + { + "epoch": 0.6273835573616755, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018820153544636713, + "loss": 1.6451, + "step": 2007 + }, + { + "epoch": 0.6276961550484527, + "grad_norm": 0.2392578125, + "learning_rate": 0.000188189954625318, + "loss": 1.6479, + "step": 2008 + }, + { + "epoch": 0.6280087527352297, + "grad_norm": 0.23046875, + "learning_rate": 0.0001881783684801289, + "loss": 1.6755, + "step": 2009 + }, + { + "epoch": 0.6283213504220069, + "grad_norm": 0.228515625, + "learning_rate": 0.00018816677701149939, + "loss": 1.6337, + "step": 2010 + }, + { + "epoch": 0.628633948108784, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018815518022012915, + "loss": 1.648, + "step": 2011 + }, + { + "epoch": 0.6289465457955611, + "grad_norm": 0.234375, + "learning_rate": 0.00018814357810671833, + "loss": 1.586, + "step": 2012 + }, + { + "epoch": 0.6292591434823382, + "grad_norm": 0.232421875, + "learning_rate": 0.0001881319706719674, + "loss": 1.5722, + "step": 2013 + }, + { + "epoch": 0.6295717411691153, + "grad_norm": 0.251953125, + "learning_rate": 0.0001881203579165771, + "loss": 1.946, + "step": 2014 + }, + { + "epoch": 0.6298843388558925, + "grad_norm": 0.228515625, + "learning_rate": 0.0001881087398412485, + "loss": 1.7869, + "step": 2015 + }, + { + "epoch": 0.6301969365426696, + "grad_norm": 0.21875, + "learning_rate": 0.000188097116446683, + "loss": 1.7194, + "step": 2016 + }, + { + "epoch": 0.6305095342294467, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001880854877335823, + "loss": 2.0099, + "step": 2017 + }, + { + "epoch": 0.6308221319162238, + "grad_norm": 0.228515625, + "learning_rate": 0.00018807385370264848, + "loss": 1.8415, + "step": 2018 + }, + { + "epoch": 0.631134729603001, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018806221435458388, + "loss": 1.6398, + "step": 2019 + }, + { + "epoch": 0.631447327289778, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018805056969009115, + "loss": 1.8436, + "step": 2020 + }, + { + "epoch": 0.6317599249765552, + "grad_norm": 0.22265625, + "learning_rate": 0.00018803891970987333, + "loss": 1.5016, + "step": 2021 + }, + { + "epoch": 0.6320725226633322, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018802726441463375, + "loss": 1.5147, + "step": 2022 + }, + { + "epoch": 
0.6323851203501094, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018801560380507604, + "loss": 1.5146, + "step": 2023 + }, + { + "epoch": 0.6326977180368866, + "grad_norm": 0.224609375, + "learning_rate": 0.00018800393788190415, + "loss": 1.8504, + "step": 2024 + }, + { + "epoch": 0.6330103157236636, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018799226664582245, + "loss": 1.6024, + "step": 2025 + }, + { + "epoch": 0.6333229134104408, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018798059009753542, + "loss": 1.8456, + "step": 2026 + }, + { + "epoch": 0.6336355110972178, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018796890823774806, + "loss": 1.5829, + "step": 2027 + }, + { + "epoch": 0.633948108783995, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018795722106716562, + "loss": 1.8332, + "step": 2028 + }, + { + "epoch": 0.6342607064707722, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018794552858649366, + "loss": 1.8867, + "step": 2029 + }, + { + "epoch": 0.6345733041575492, + "grad_norm": 0.23828125, + "learning_rate": 0.00018793383079643804, + "loss": 1.7046, + "step": 2030 + }, + { + "epoch": 0.6348859018443264, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018792212769770507, + "loss": 1.4539, + "step": 2031 + }, + { + "epoch": 0.6351984995311035, + "grad_norm": 0.224609375, + "learning_rate": 0.00018791041929100115, + "loss": 1.7966, + "step": 2032 + }, + { + "epoch": 0.6355110972178806, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001878987055770332, + "loss": 1.7888, + "step": 2033 + }, + { + "epoch": 0.6358236949046577, + "grad_norm": 0.24609375, + "learning_rate": 0.0001878869865565084, + "loss": 1.5578, + "step": 2034 + }, + { + "epoch": 0.6361362925914348, + "grad_norm": 0.228515625, + "learning_rate": 0.0001878752622301342, + "loss": 1.7211, + "step": 2035 + }, + { + "epoch": 0.6364488902782119, + "grad_norm": 0.228515625, + "learning_rate": 0.00018786353259861847, + "loss": 1.5837, + "step": 2036 + }, + { + "epoch": 0.6367614879649891, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001878517976626693, + "loss": 1.6654, + "step": 2037 + }, + { + "epoch": 0.6370740856517662, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018784005742299514, + "loss": 1.9085, + "step": 2038 + }, + { + "epoch": 0.6373866833385433, + "grad_norm": 0.275390625, + "learning_rate": 0.0001878283118803048, + "loss": 1.6215, + "step": 2039 + }, + { + "epoch": 0.6376992810253204, + "grad_norm": 0.240234375, + "learning_rate": 0.00018781656103530737, + "loss": 1.9168, + "step": 2040 + }, + { + "epoch": 0.6380118787120975, + "grad_norm": 0.224609375, + "learning_rate": 0.0001878048048887122, + "loss": 1.8944, + "step": 2041 + }, + { + "epoch": 0.6383244763988747, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018779304344122908, + "loss": 1.7528, + "step": 2042 + }, + { + "epoch": 0.6386370740856517, + "grad_norm": 0.228515625, + "learning_rate": 0.00018778127669356805, + "loss": 1.8204, + "step": 2043 + }, + { + "epoch": 0.6389496717724289, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001877695046464395, + "loss": 1.7069, + "step": 2044 + }, + { + "epoch": 0.6392622694592061, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001877577273005541, + "loss": 1.3533, + "step": 2045 + }, + { + "epoch": 0.6395748671459831, + "grad_norm": 0.22265625, + "learning_rate": 0.00018774594465662288, + "loss": 1.6023, + "step": 2046 + }, + { + "epoch": 0.6398874648327603, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018773415671535714, + 
"loss": 1.9426, + "step": 2047 + }, + { + "epoch": 0.6402000625195373, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018772236347746856, + "loss": 1.7982, + "step": 2048 + }, + { + "epoch": 0.6405126602063145, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018771056494366913, + "loss": 1.7041, + "step": 2049 + }, + { + "epoch": 0.6408252578930916, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018769876111467113, + "loss": 1.7406, + "step": 2050 + }, + { + "epoch": 0.6411378555798687, + "grad_norm": 0.240234375, + "learning_rate": 0.00018768695199118717, + "loss": 1.6077, + "step": 2051 + }, + { + "epoch": 0.6414504532666458, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018767513757393016, + "loss": 1.7813, + "step": 2052 + }, + { + "epoch": 0.6417630509534229, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018766331786361338, + "loss": 1.6976, + "step": 2053 + }, + { + "epoch": 0.6420756486402001, + "grad_norm": 0.2421875, + "learning_rate": 0.00018765149286095037, + "loss": 1.6368, + "step": 2054 + }, + { + "epoch": 0.6423882463269772, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018763966256665505, + "loss": 1.6045, + "step": 2055 + }, + { + "epoch": 0.6427008440137543, + "grad_norm": 0.22265625, + "learning_rate": 0.00018762782698144163, + "loss": 1.5185, + "step": 2056 + }, + { + "epoch": 0.6430134417005314, + "grad_norm": 0.23828125, + "learning_rate": 0.00018761598610602463, + "loss": 1.5806, + "step": 2057 + }, + { + "epoch": 0.6433260393873085, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001876041399411189, + "loss": 1.6609, + "step": 2058 + }, + { + "epoch": 0.6436386370740856, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001875922884874396, + "loss": 1.6643, + "step": 2059 + }, + { + "epoch": 0.6439512347608628, + "grad_norm": 0.236328125, + "learning_rate": 0.00018758043174570222, + "loss": 1.5697, + "step": 2060 + }, + { + "epoch": 0.6442638324476399, + "grad_norm": 0.22265625, + "learning_rate": 0.00018756856971662258, + "loss": 1.6761, + "step": 2061 + }, + { + "epoch": 0.644576430134417, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018755670240091677, + "loss": 1.5763, + "step": 2062 + }, + { + "epoch": 0.6448890278211942, + "grad_norm": 0.240234375, + "learning_rate": 0.0001875448297993013, + "loss": 1.7233, + "step": 2063 + }, + { + "epoch": 0.6452016255079712, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018753295191249286, + "loss": 1.623, + "step": 2064 + }, + { + "epoch": 0.6455142231947484, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018752106874120862, + "loss": 1.5065, + "step": 2065 + }, + { + "epoch": 0.6458268208815254, + "grad_norm": 0.251953125, + "learning_rate": 0.0001875091802861659, + "loss": 2.0689, + "step": 2066 + }, + { + "epoch": 0.6461394185683026, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018749728654808242, + "loss": 1.7316, + "step": 2067 + }, + { + "epoch": 0.6464520162550798, + "grad_norm": 0.224609375, + "learning_rate": 0.0001874853875276763, + "loss": 1.7759, + "step": 2068 + }, + { + "epoch": 0.6467646139418568, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018747348322566582, + "loss": 1.6177, + "step": 2069 + }, + { + "epoch": 0.647077211628634, + "grad_norm": 0.244140625, + "learning_rate": 0.0001874615736427697, + "loss": 1.8813, + "step": 2070 + }, + { + "epoch": 0.647389809315411, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018744965877970696, + "loss": 1.6428, + "step": 2071 + }, + { + "epoch": 0.6477024070021882, + "grad_norm": 0.2890625, + 
"learning_rate": 0.00018743773863719683, + "loss": 2.3381, + "step": 2072 + }, + { + "epoch": 0.6480150046889653, + "grad_norm": 0.2265625, + "learning_rate": 0.00018742581321595902, + "loss": 1.4568, + "step": 2073 + }, + { + "epoch": 0.6483276023757424, + "grad_norm": 0.220703125, + "learning_rate": 0.00018741388251671345, + "loss": 1.5651, + "step": 2074 + }, + { + "epoch": 0.6486402000625195, + "grad_norm": 0.2421875, + "learning_rate": 0.0001874019465401804, + "loss": 1.8459, + "step": 2075 + }, + { + "epoch": 0.6489527977492967, + "grad_norm": 0.2265625, + "learning_rate": 0.00018739000528708046, + "loss": 1.6691, + "step": 2076 + }, + { + "epoch": 0.6492653954360738, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018737805875813454, + "loss": 1.8378, + "step": 2077 + }, + { + "epoch": 0.6495779931228509, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018736610695406386, + "loss": 1.8245, + "step": 2078 + }, + { + "epoch": 0.649890590809628, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018735414987559, + "loss": 1.7107, + "step": 2079 + }, + { + "epoch": 0.6502031884964051, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018734218752343478, + "loss": 1.7694, + "step": 2080 + }, + { + "epoch": 0.6505157861831823, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018733021989832035, + "loss": 1.7134, + "step": 2081 + }, + { + "epoch": 0.6508283838699593, + "grad_norm": 0.216796875, + "learning_rate": 0.00018731824700096933, + "loss": 1.8064, + "step": 2082 + }, + { + "epoch": 0.6511409815567365, + "grad_norm": 0.23828125, + "learning_rate": 0.00018730626883210443, + "loss": 1.694, + "step": 2083 + }, + { + "epoch": 0.6514535792435135, + "grad_norm": 0.2158203125, + "learning_rate": 0.00018729428539244884, + "loss": 1.7573, + "step": 2084 + }, + { + "epoch": 0.6517661769302907, + "grad_norm": 0.228515625, + "learning_rate": 0.00018728229668272598, + "loss": 1.6263, + "step": 2085 + }, + { + "epoch": 0.6520787746170679, + "grad_norm": 0.2158203125, + "learning_rate": 0.00018727030270365965, + "loss": 1.846, + "step": 2086 + }, + { + "epoch": 0.6523913723038449, + "grad_norm": 0.244140625, + "learning_rate": 0.00018725830345597396, + "loss": 1.7912, + "step": 2087 + }, + { + "epoch": 0.6527039699906221, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001872462989403933, + "loss": 1.777, + "step": 2088 + }, + { + "epoch": 0.6530165676773992, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018723428915764237, + "loss": 1.675, + "step": 2089 + }, + { + "epoch": 0.6533291653641763, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018722227410844625, + "loss": 1.5869, + "step": 2090 + }, + { + "epoch": 0.6536417630509535, + "grad_norm": 0.244140625, + "learning_rate": 0.00018721025379353026, + "loss": 1.8295, + "step": 2091 + }, + { + "epoch": 0.6539543607377305, + "grad_norm": 0.23046875, + "learning_rate": 0.00018719822821362017, + "loss": 1.6437, + "step": 2092 + }, + { + "epoch": 0.6542669584245077, + "grad_norm": 0.2421875, + "learning_rate": 0.0001871861973694419, + "loss": 1.8373, + "step": 2093 + }, + { + "epoch": 0.6545795561112848, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018717416126172177, + "loss": 1.3641, + "step": 2094 + }, + { + "epoch": 0.6548921537980619, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018716211989118646, + "loss": 1.7446, + "step": 2095 + }, + { + "epoch": 0.655204751484839, + "grad_norm": 0.234375, + "learning_rate": 0.00018715007325856292, + "loss": 1.7373, + "step": 2096 + }, + { + "epoch": 
0.6555173491716161, + "grad_norm": 0.23828125, + "learning_rate": 0.00018713802136457837, + "loss": 1.6263, + "step": 2097 + }, + { + "epoch": 0.6558299468583932, + "grad_norm": 0.23046875, + "learning_rate": 0.00018712596420996045, + "loss": 1.7508, + "step": 2098 + }, + { + "epoch": 0.6561425445451704, + "grad_norm": 0.232421875, + "learning_rate": 0.00018711390179543703, + "loss": 1.8481, + "step": 2099 + }, + { + "epoch": 0.6564551422319475, + "grad_norm": 0.232421875, + "learning_rate": 0.00018710183412173635, + "loss": 1.7739, + "step": 2100 + }, + { + "epoch": 0.6567677399187246, + "grad_norm": 0.2265625, + "learning_rate": 0.00018708976118958693, + "loss": 1.989, + "step": 2101 + }, + { + "epoch": 0.6570803376055018, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001870776829997177, + "loss": 1.8054, + "step": 2102 + }, + { + "epoch": 0.6573929352922788, + "grad_norm": 0.2265625, + "learning_rate": 0.00018706559955285773, + "loss": 1.665, + "step": 2103 + }, + { + "epoch": 0.657705532979056, + "grad_norm": 0.22265625, + "learning_rate": 0.0001870535108497366, + "loss": 1.703, + "step": 2104 + }, + { + "epoch": 0.658018130665833, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001870414168910841, + "loss": 1.7818, + "step": 2105 + }, + { + "epoch": 0.6583307283526102, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018702931767763028, + "loss": 1.5893, + "step": 2106 + }, + { + "epoch": 0.6586433260393874, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001870172132101057, + "loss": 1.6743, + "step": 2107 + }, + { + "epoch": 0.6589559237261644, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018700510348924106, + "loss": 1.5062, + "step": 2108 + }, + { + "epoch": 0.6592685214129416, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018699298851576743, + "loss": 1.4517, + "step": 2109 + }, + { + "epoch": 0.6595811190997186, + "grad_norm": 0.23828125, + "learning_rate": 0.00018698086829041627, + "loss": 1.7555, + "step": 2110 + }, + { + "epoch": 0.6598937167864958, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001869687428139192, + "loss": 1.7701, + "step": 2111 + }, + { + "epoch": 0.6602063144732729, + "grad_norm": 0.228515625, + "learning_rate": 0.00018695661208700836, + "loss": 1.5693, + "step": 2112 + }, + { + "epoch": 0.66051891216005, + "grad_norm": 0.2265625, + "learning_rate": 0.000186944476110416, + "loss": 1.473, + "step": 2113 + }, + { + "epoch": 0.6608315098468271, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018693233488487483, + "loss": 1.4396, + "step": 2114 + }, + { + "epoch": 0.6611441075336042, + "grad_norm": 0.236328125, + "learning_rate": 0.00018692018841111782, + "loss": 1.9964, + "step": 2115 + }, + { + "epoch": 0.6614567052203814, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018690803668987827, + "loss": 1.6639, + "step": 2116 + }, + { + "epoch": 0.6617693029071585, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001868958797218898, + "loss": 1.7607, + "step": 2117 + }, + { + "epoch": 0.6620819005939356, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018688371750788635, + "loss": 1.5137, + "step": 2118 + }, + { + "epoch": 0.6623944982807127, + "grad_norm": 0.21875, + "learning_rate": 0.00018687155004860215, + "loss": 1.5756, + "step": 2119 + }, + { + "epoch": 0.6627070959674899, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018685937734477177, + "loss": 1.7926, + "step": 2120 + }, + { + "epoch": 0.6630196936542669, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001868471993971301, + "loss": 1.7269, + 
"step": 2121 + }, + { + "epoch": 0.6633322913410441, + "grad_norm": 0.232421875, + "learning_rate": 0.0001868350162064123, + "loss": 1.6515, + "step": 2122 + }, + { + "epoch": 0.6636448890278211, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018682282777335397, + "loss": 1.5462, + "step": 2123 + }, + { + "epoch": 0.6639574867145983, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018681063409869085, + "loss": 1.7719, + "step": 2124 + }, + { + "epoch": 0.6642700844013755, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018679843518315913, + "loss": 1.9495, + "step": 2125 + }, + { + "epoch": 0.6645826820881525, + "grad_norm": 0.232421875, + "learning_rate": 0.0001867862310274953, + "loss": 1.5323, + "step": 2126 + }, + { + "epoch": 0.6648952797749297, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018677402163243606, + "loss": 1.5997, + "step": 2127 + }, + { + "epoch": 0.6652078774617067, + "grad_norm": 0.2265625, + "learning_rate": 0.0001867618069987186, + "loss": 1.891, + "step": 2128 + }, + { + "epoch": 0.6655204751484839, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018674958712708027, + "loss": 1.7805, + "step": 2129 + }, + { + "epoch": 0.665833072835261, + "grad_norm": 0.228515625, + "learning_rate": 0.00018673736201825882, + "loss": 1.7896, + "step": 2130 + }, + { + "epoch": 0.6661456705220381, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001867251316729923, + "loss": 1.8483, + "step": 2131 + }, + { + "epoch": 0.6664582682088153, + "grad_norm": 0.234375, + "learning_rate": 0.00018671289609201907, + "loss": 1.8642, + "step": 2132 + }, + { + "epoch": 0.6667708658955924, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001867006552760778, + "loss": 1.4944, + "step": 2133 + }, + { + "epoch": 0.6670834635823695, + "grad_norm": 0.2265625, + "learning_rate": 0.00018668840922590746, + "loss": 1.4096, + "step": 2134 + }, + { + "epoch": 0.6673960612691466, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018667615794224743, + "loss": 1.8447, + "step": 2135 + }, + { + "epoch": 0.6677086589559237, + "grad_norm": 0.228515625, + "learning_rate": 0.00018666390142583724, + "loss": 1.7672, + "step": 2136 + }, + { + "epoch": 0.6680212566427008, + "grad_norm": 0.224609375, + "learning_rate": 0.00018665163967741694, + "loss": 1.4677, + "step": 2137 + }, + { + "epoch": 0.668333854329478, + "grad_norm": 0.248046875, + "learning_rate": 0.0001866393726977267, + "loss": 1.9113, + "step": 2138 + }, + { + "epoch": 0.668646452016255, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018662710048750712, + "loss": 1.6074, + "step": 2139 + }, + { + "epoch": 0.6689590497030322, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018661482304749915, + "loss": 1.9865, + "step": 2140 + }, + { + "epoch": 0.6692716473898093, + "grad_norm": 0.234375, + "learning_rate": 0.00018660254037844388, + "loss": 1.5433, + "step": 2141 + }, + { + "epoch": 0.6695842450765864, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018659025248108288, + "loss": 1.7213, + "step": 2142 + }, + { + "epoch": 0.6698968427633636, + "grad_norm": 0.220703125, + "learning_rate": 0.00018657795935615802, + "loss": 1.7668, + "step": 2143 + }, + { + "epoch": 0.6702094404501406, + "grad_norm": 0.240234375, + "learning_rate": 0.00018656566100441144, + "loss": 1.7344, + "step": 2144 + }, + { + "epoch": 0.6705220381369178, + "grad_norm": 0.21875, + "learning_rate": 0.00018655335742658556, + "loss": 1.6451, + "step": 2145 + }, + { + "epoch": 0.670834635823695, + "grad_norm": 0.224609375, + "learning_rate": 
0.00018654104862342324, + "loss": 1.6888, + "step": 2146 + }, + { + "epoch": 0.671147233510472, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018652873459566749, + "loss": 1.426, + "step": 2147 + }, + { + "epoch": 0.6714598311972492, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018651641534406178, + "loss": 1.6177, + "step": 2148 + }, + { + "epoch": 0.6717724288840262, + "grad_norm": 0.240234375, + "learning_rate": 0.00018650409086934985, + "loss": 1.6962, + "step": 2149 + }, + { + "epoch": 0.6720850265708034, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001864917611722757, + "loss": 1.6879, + "step": 2150 + }, + { + "epoch": 0.6723976242575805, + "grad_norm": 0.22265625, + "learning_rate": 0.0001864794262535837, + "loss": 1.9992, + "step": 2151 + }, + { + "epoch": 0.6727102219443576, + "grad_norm": 0.23046875, + "learning_rate": 0.0001864670861140186, + "loss": 1.9401, + "step": 2152 + }, + { + "epoch": 0.6730228196311347, + "grad_norm": 0.2421875, + "learning_rate": 0.00018645474075432524, + "loss": 1.8057, + "step": 2153 + }, + { + "epoch": 0.6733354173179118, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018644239017524906, + "loss": 2.0631, + "step": 2154 + }, + { + "epoch": 0.673648015004689, + "grad_norm": 0.234375, + "learning_rate": 0.00018643003437753558, + "loss": 1.6794, + "step": 2155 + }, + { + "epoch": 0.6739606126914661, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018641767336193086, + "loss": 1.7738, + "step": 2156 + }, + { + "epoch": 0.6742732103782432, + "grad_norm": 0.240234375, + "learning_rate": 0.000186405307129181, + "loss": 1.8517, + "step": 2157 + }, + { + "epoch": 0.6745858080650203, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018639293568003268, + "loss": 1.5776, + "step": 2158 + }, + { + "epoch": 0.6748984057517975, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018638055901523277, + "loss": 1.6955, + "step": 2159 + }, + { + "epoch": 0.6752110034385745, + "grad_norm": 0.23828125, + "learning_rate": 0.00018636817713552837, + "loss": 1.6111, + "step": 2160 + }, + { + "epoch": 0.6755236011253517, + "grad_norm": 0.2421875, + "learning_rate": 0.00018635579004166712, + "loss": 1.8155, + "step": 2161 + }, + { + "epoch": 0.6758361988121288, + "grad_norm": 0.2177734375, + "learning_rate": 0.00018634339773439674, + "loss": 1.6656, + "step": 2162 + }, + { + "epoch": 0.6761487964989059, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001863310002144654, + "loss": 1.5922, + "step": 2163 + }, + { + "epoch": 0.6764613941856831, + "grad_norm": 0.220703125, + "learning_rate": 0.0001863185974826216, + "loss": 1.7238, + "step": 2164 + }, + { + "epoch": 0.6767739918724601, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018630618953961408, + "loss": 1.6582, + "step": 2165 + }, + { + "epoch": 0.6770865895592373, + "grad_norm": 0.283203125, + "learning_rate": 0.0001862937763861919, + "loss": 2.3931, + "step": 2166 + }, + { + "epoch": 0.6773991872460143, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018628135802310446, + "loss": 1.7434, + "step": 2167 + }, + { + "epoch": 0.6777117849327915, + "grad_norm": 0.251953125, + "learning_rate": 0.0001862689344511015, + "loss": 2.0366, + "step": 2168 + }, + { + "epoch": 0.6780243826195687, + "grad_norm": 0.232421875, + "learning_rate": 0.000186256505670933, + "loss": 1.6197, + "step": 2169 + }, + { + "epoch": 0.6783369803063457, + "grad_norm": 0.22265625, + "learning_rate": 0.0001862440716833494, + "loss": 1.5561, + "step": 2170 + }, + { + "epoch": 0.6786495779931229, + "grad_norm": 
0.248046875, + "learning_rate": 0.00018623163248910127, + "loss": 1.8304, + "step": 2171 + }, + { + "epoch": 0.6789621756798999, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018621918808893958, + "loss": 1.3873, + "step": 2172 + }, + { + "epoch": 0.6792747733666771, + "grad_norm": 0.23046875, + "learning_rate": 0.00018620673848361566, + "loss": 1.4493, + "step": 2173 + }, + { + "epoch": 0.6795873710534542, + "grad_norm": 0.25390625, + "learning_rate": 0.00018619428367388103, + "loss": 1.7057, + "step": 2174 + }, + { + "epoch": 0.6798999687402313, + "grad_norm": 0.232421875, + "learning_rate": 0.0001861818236604877, + "loss": 1.5443, + "step": 2175 + }, + { + "epoch": 0.6802125664270084, + "grad_norm": 0.228515625, + "learning_rate": 0.00018616935844418785, + "loss": 1.651, + "step": 2176 + }, + { + "epoch": 0.6805251641137856, + "grad_norm": 0.2470703125, + "learning_rate": 0.000186156888025734, + "loss": 1.7987, + "step": 2177 + }, + { + "epoch": 0.6808377618005627, + "grad_norm": 0.234375, + "learning_rate": 0.00018614441240587907, + "loss": 1.8154, + "step": 2178 + }, + { + "epoch": 0.6811503594873398, + "grad_norm": 0.232421875, + "learning_rate": 0.0001861319315853762, + "loss": 1.7168, + "step": 2179 + }, + { + "epoch": 0.6814629571741169, + "grad_norm": 0.234375, + "learning_rate": 0.0001861194455649788, + "loss": 1.4816, + "step": 2180 + }, + { + "epoch": 0.681775554860894, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018610695434544074, + "loss": 1.5243, + "step": 2181 + }, + { + "epoch": 0.6820881525476712, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018609445792751618, + "loss": 1.7344, + "step": 2182 + }, + { + "epoch": 0.6824007502344482, + "grad_norm": 0.228515625, + "learning_rate": 0.00018608195631195939, + "loss": 1.8136, + "step": 2183 + }, + { + "epoch": 0.6827133479212254, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018606944949952524, + "loss": 1.7538, + "step": 2184 + }, + { + "epoch": 0.6830259456080024, + "grad_norm": 0.236328125, + "learning_rate": 0.00018605693749096876, + "loss": 1.8747, + "step": 2185 + }, + { + "epoch": 0.6833385432947796, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018604442028704533, + "loss": 1.6926, + "step": 2186 + }, + { + "epoch": 0.6836511409815568, + "grad_norm": 0.228515625, + "learning_rate": 0.00018603189788851055, + "loss": 1.7869, + "step": 2187 + }, + { + "epoch": 0.6839637386683338, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018601937029612048, + "loss": 1.6719, + "step": 2188 + }, + { + "epoch": 0.684276336355111, + "grad_norm": 0.23828125, + "learning_rate": 0.0001860068375106314, + "loss": 1.7719, + "step": 2189 + }, + { + "epoch": 0.6845889340418881, + "grad_norm": 0.2265625, + "learning_rate": 0.00018599429953279994, + "loss": 1.618, + "step": 2190 + }, + { + "epoch": 0.6849015317286652, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018598175636338305, + "loss": 1.7768, + "step": 2191 + }, + { + "epoch": 0.6852141294154424, + "grad_norm": 0.234375, + "learning_rate": 0.00018596920800313798, + "loss": 1.9978, + "step": 2192 + }, + { + "epoch": 0.6855267271022194, + "grad_norm": 0.22265625, + "learning_rate": 0.0001859566544528222, + "loss": 1.3867, + "step": 2193 + }, + { + "epoch": 0.6858393247889966, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001859440957131937, + "loss": 1.5844, + "step": 2194 + }, + { + "epoch": 0.6861519224757737, + "grad_norm": 0.234375, + "learning_rate": 0.00018593153178501063, + "loss": 1.7227, + "step": 2195 + }, + { + "epoch": 
0.6864645201625508, + "grad_norm": 0.25390625, + "learning_rate": 0.0001859189626690315, + "loss": 1.8812, + "step": 2196 + }, + { + "epoch": 0.6867771178493279, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018590638836601505, + "loss": 1.5477, + "step": 2197 + }, + { + "epoch": 0.687089715536105, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001858938088767205, + "loss": 1.8684, + "step": 2198 + }, + { + "epoch": 0.6874023132228821, + "grad_norm": 0.236328125, + "learning_rate": 0.00018588122420190722, + "loss": 1.8864, + "step": 2199 + }, + { + "epoch": 0.6877149109096593, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018586863434233504, + "loss": 1.7888, + "step": 2200 + }, + { + "epoch": 0.6880275085964364, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018585603929876395, + "loss": 1.6452, + "step": 2201 + }, + { + "epoch": 0.6883401062832135, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018584343907195437, + "loss": 1.585, + "step": 2202 + }, + { + "epoch": 0.6886527039699907, + "grad_norm": 0.23828125, + "learning_rate": 0.000185830833662667, + "loss": 1.7144, + "step": 2203 + }, + { + "epoch": 0.6889653016567677, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018581822307166281, + "loss": 1.7379, + "step": 2204 + }, + { + "epoch": 0.6892778993435449, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018580560729970313, + "loss": 1.777, + "step": 2205 + }, + { + "epoch": 0.6895904970303219, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018579298634754962, + "loss": 1.902, + "step": 2206 + }, + { + "epoch": 0.6899030947170991, + "grad_norm": 0.220703125, + "learning_rate": 0.00018578036021596415, + "loss": 1.6602, + "step": 2207 + }, + { + "epoch": 0.6902156924038763, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018576772890570905, + "loss": 1.8837, + "step": 2208 + }, + { + "epoch": 0.6905282900906533, + "grad_norm": 0.251953125, + "learning_rate": 0.00018575509241754685, + "loss": 1.6694, + "step": 2209 + }, + { + "epoch": 0.6908408877774305, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018574245075224046, + "loss": 1.7201, + "step": 2210 + }, + { + "epoch": 0.6911534854642075, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018572980391055305, + "loss": 1.4998, + "step": 2211 + }, + { + "epoch": 0.6914660831509847, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018571715189324813, + "loss": 1.4607, + "step": 2212 + }, + { + "epoch": 0.6917786808377618, + "grad_norm": 0.2158203125, + "learning_rate": 0.00018570449470108952, + "loss": 1.8028, + "step": 2213 + }, + { + "epoch": 0.6920912785245389, + "grad_norm": 0.234375, + "learning_rate": 0.00018569183233484133, + "loss": 1.5558, + "step": 2214 + }, + { + "epoch": 0.692403876211316, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018567916479526804, + "loss": 1.5834, + "step": 2215 + }, + { + "epoch": 0.6927164738980931, + "grad_norm": 0.232421875, + "learning_rate": 0.0001856664920831344, + "loss": 1.6607, + "step": 2216 + }, + { + "epoch": 0.6930290715848703, + "grad_norm": 0.236328125, + "learning_rate": 0.00018565381419920546, + "loss": 1.5378, + "step": 2217 + }, + { + "epoch": 0.6933416692716474, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018564113114424662, + "loss": 1.8949, + "step": 2218 + }, + { + "epoch": 0.6936542669584245, + "grad_norm": 0.234375, + "learning_rate": 0.00018562844291902353, + "loss": 1.9261, + "step": 2219 + }, + { + "epoch": 0.6939668646452016, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018561574952430222, + "loss": 
2.0413, + "step": 2220 + }, + { + "epoch": 0.6942794623319788, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018560305096084904, + "loss": 1.7628, + "step": 2221 + }, + { + "epoch": 0.6945920600187558, + "grad_norm": 0.240234375, + "learning_rate": 0.00018559034722943056, + "loss": 1.6226, + "step": 2222 + }, + { + "epoch": 0.694904657705533, + "grad_norm": 0.22265625, + "learning_rate": 0.00018557763833081377, + "loss": 1.8693, + "step": 2223 + }, + { + "epoch": 0.69521725539231, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001855649242657659, + "loss": 1.4996, + "step": 2224 + }, + { + "epoch": 0.6955298530790872, + "grad_norm": 0.25390625, + "learning_rate": 0.00018555220503505452, + "loss": 2.2346, + "step": 2225 + }, + { + "epoch": 0.6958424507658644, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018553948063944749, + "loss": 1.773, + "step": 2226 + }, + { + "epoch": 0.6961550484526414, + "grad_norm": 0.2373046875, + "learning_rate": 0.000185526751079713, + "loss": 1.8362, + "step": 2227 + }, + { + "epoch": 0.6964676461394186, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018551401635661958, + "loss": 1.6007, + "step": 2228 + }, + { + "epoch": 0.6967802438261956, + "grad_norm": 0.234375, + "learning_rate": 0.00018550127647093601, + "loss": 1.5875, + "step": 2229 + }, + { + "epoch": 0.6970928415129728, + "grad_norm": 0.228515625, + "learning_rate": 0.00018548853142343142, + "loss": 1.7156, + "step": 2230 + }, + { + "epoch": 0.69740543919975, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018547578121487528, + "loss": 1.784, + "step": 2231 + }, + { + "epoch": 0.697718036886527, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018546302584603727, + "loss": 1.6756, + "step": 2232 + }, + { + "epoch": 0.6980306345733042, + "grad_norm": 0.25, + "learning_rate": 0.0001854502653176875, + "loss": 1.8622, + "step": 2233 + }, + { + "epoch": 0.6983432322600813, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001854374996305963, + "loss": 1.383, + "step": 2234 + }, + { + "epoch": 0.6986558299468584, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001854247287855344, + "loss": 1.516, + "step": 2235 + }, + { + "epoch": 0.6989684276336355, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018541195278327276, + "loss": 1.5284, + "step": 2236 + }, + { + "epoch": 0.6992810253204126, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001853991716245827, + "loss": 1.4208, + "step": 2237 + }, + { + "epoch": 0.6995936230071897, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001853863853102358, + "loss": 1.8169, + "step": 2238 + }, + { + "epoch": 0.6999062206939669, + "grad_norm": 0.23046875, + "learning_rate": 0.000185373593841004, + "loss": 1.686, + "step": 2239 + }, + { + "epoch": 0.700218818380744, + "grad_norm": 0.234375, + "learning_rate": 0.00018536079721765956, + "loss": 1.4067, + "step": 2240 + }, + { + "epoch": 0.7005314160675211, + "grad_norm": 0.228515625, + "learning_rate": 0.00018534799544097505, + "loss": 1.7239, + "step": 2241 + }, + { + "epoch": 0.7008440137542982, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018533518851172325, + "loss": 1.6176, + "step": 2242 + }, + { + "epoch": 0.7011566114410753, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001853223764306774, + "loss": 1.6086, + "step": 2243 + }, + { + "epoch": 0.7014692091278525, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018530955919861096, + "loss": 1.5131, + "step": 2244 + }, + { + "epoch": 0.7017818068146295, + "grad_norm": 0.224609375, + "learning_rate": 
0.0001852967368162977, + "loss": 1.685, + "step": 2245 + }, + { + "epoch": 0.7020944045014067, + "grad_norm": 0.232421875, + "learning_rate": 0.00018528390928451173, + "loss": 1.8137, + "step": 2246 + }, + { + "epoch": 0.7024070021881839, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018527107660402752, + "loss": 1.7175, + "step": 2247 + }, + { + "epoch": 0.7027195998749609, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018525823877561974, + "loss": 1.6921, + "step": 2248 + }, + { + "epoch": 0.7030321975617381, + "grad_norm": 0.23046875, + "learning_rate": 0.0001852453958000634, + "loss": 1.9215, + "step": 2249 + }, + { + "epoch": 0.7033447952485151, + "grad_norm": 0.228515625, + "learning_rate": 0.00018523254767813393, + "loss": 1.5655, + "step": 2250 + }, + { + "epoch": 0.7036573929352923, + "grad_norm": 0.2177734375, + "learning_rate": 0.00018521969441060695, + "loss": 1.6418, + "step": 2251 + }, + { + "epoch": 0.7039699906220694, + "grad_norm": 0.240234375, + "learning_rate": 0.0001852068359982584, + "loss": 1.8771, + "step": 2252 + }, + { + "epoch": 0.7042825883088465, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018519397244186458, + "loss": 1.7217, + "step": 2253 + }, + { + "epoch": 0.7045951859956237, + "grad_norm": 0.228515625, + "learning_rate": 0.0001851811037422021, + "loss": 1.8586, + "step": 2254 + }, + { + "epoch": 0.7049077836824007, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018516822990004782, + "loss": 1.5904, + "step": 2255 + }, + { + "epoch": 0.7052203813691779, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018515535091617898, + "loss": 1.6428, + "step": 2256 + }, + { + "epoch": 0.705532979055955, + "grad_norm": 0.234375, + "learning_rate": 0.0001851424667913731, + "loss": 1.7164, + "step": 2257 + }, + { + "epoch": 0.7058455767427321, + "grad_norm": 0.23046875, + "learning_rate": 0.00018512957752640799, + "loss": 1.7193, + "step": 2258 + }, + { + "epoch": 0.7061581744295092, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018511668312206177, + "loss": 1.5025, + "step": 2259 + }, + { + "epoch": 0.7064707721162864, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018510378357911296, + "loss": 1.612, + "step": 2260 + }, + { + "epoch": 0.7067833698030634, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018509087889834031, + "loss": 1.5849, + "step": 2261 + }, + { + "epoch": 0.7070959674898406, + "grad_norm": 0.25, + "learning_rate": 0.00018507796908052285, + "loss": 1.6807, + "step": 2262 + }, + { + "epoch": 0.7074085651766177, + "grad_norm": 0.228515625, + "learning_rate": 0.00018506505412643995, + "loss": 1.6728, + "step": 2263 + }, + { + "epoch": 0.7077211628633948, + "grad_norm": 0.234375, + "learning_rate": 0.00018505213403687137, + "loss": 1.7322, + "step": 2264 + }, + { + "epoch": 0.708033760550172, + "grad_norm": 0.2265625, + "learning_rate": 0.00018503920881259703, + "loss": 1.6204, + "step": 2265 + }, + { + "epoch": 0.708346358236949, + "grad_norm": 0.228515625, + "learning_rate": 0.00018502627845439732, + "loss": 1.5918, + "step": 2266 + }, + { + "epoch": 0.7086589559237262, + "grad_norm": 0.2421875, + "learning_rate": 0.00018501334296305285, + "loss": 1.8249, + "step": 2267 + }, + { + "epoch": 0.7089715536105032, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018500040233934454, + "loss": 1.974, + "step": 2268 + }, + { + "epoch": 0.7092841512972804, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018498745658405356, + "loss": 1.6999, + "step": 2269 + }, + { + "epoch": 0.7095967489840576, + "grad_norm": 
0.23046875, + "learning_rate": 0.00018497450569796158, + "loss": 1.9307, + "step": 2270 + }, + { + "epoch": 0.7099093466708346, + "grad_norm": 0.240234375, + "learning_rate": 0.00018496154968185036, + "loss": 1.7392, + "step": 2271 + }, + { + "epoch": 0.7102219443576118, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018494858853650213, + "loss": 1.7068, + "step": 2272 + }, + { + "epoch": 0.7105345420443888, + "grad_norm": 0.232421875, + "learning_rate": 0.0001849356222626994, + "loss": 1.8758, + "step": 2273 + }, + { + "epoch": 0.710847139731166, + "grad_norm": 0.236328125, + "learning_rate": 0.00018492265086122488, + "loss": 1.6345, + "step": 2274 + }, + { + "epoch": 0.7111597374179431, + "grad_norm": 0.24609375, + "learning_rate": 0.0001849096743328617, + "loss": 1.7491, + "step": 2275 + }, + { + "epoch": 0.7114723351047202, + "grad_norm": 0.2265625, + "learning_rate": 0.0001848966926783933, + "loss": 1.5166, + "step": 2276 + }, + { + "epoch": 0.7117849327914973, + "grad_norm": 0.234375, + "learning_rate": 0.0001848837058986034, + "loss": 1.7068, + "step": 2277 + }, + { + "epoch": 0.7120975304782745, + "grad_norm": 0.23046875, + "learning_rate": 0.00018487071399427599, + "loss": 1.7652, + "step": 2278 + }, + { + "epoch": 0.7124101281650516, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018485771696619542, + "loss": 1.7871, + "step": 2279 + }, + { + "epoch": 0.7127227258518287, + "grad_norm": 0.228515625, + "learning_rate": 0.00018484471481514635, + "loss": 1.9055, + "step": 2280 + }, + { + "epoch": 0.7130353235386058, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001848317075419137, + "loss": 1.8693, + "step": 2281 + }, + { + "epoch": 0.7133479212253829, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018481869514728279, + "loss": 1.548, + "step": 2282 + }, + { + "epoch": 0.7136605189121601, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018480567763203918, + "loss": 1.614, + "step": 2283 + }, + { + "epoch": 0.7139731165989371, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001847926549969687, + "loss": 1.4828, + "step": 2284 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018477962724285763, + "loss": 1.8229, + "step": 2285 + }, + { + "epoch": 0.7145983119724914, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018476659437049238, + "loss": 1.877, + "step": 2286 + }, + { + "epoch": 0.7149109096592685, + "grad_norm": 0.23828125, + "learning_rate": 0.00018475355638065984, + "loss": 1.5996, + "step": 2287 + }, + { + "epoch": 0.7152235073460457, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018474051327414709, + "loss": 1.6033, + "step": 2288 + }, + { + "epoch": 0.7155361050328227, + "grad_norm": 0.248046875, + "learning_rate": 0.00018472746505174156, + "loss": 1.6509, + "step": 2289 + }, + { + "epoch": 0.7158487027195999, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018471441171423103, + "loss": 1.8609, + "step": 2290 + }, + { + "epoch": 0.716161300406377, + "grad_norm": 0.224609375, + "learning_rate": 0.00018470135326240347, + "loss": 1.8864, + "step": 2291 + }, + { + "epoch": 0.7164738980931541, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001846882896970473, + "loss": 1.5743, + "step": 2292 + }, + { + "epoch": 0.7167864957799313, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018467522101895116, + "loss": 1.8124, + "step": 2293 + }, + { + "epoch": 0.7170990934667083, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018466214722890402, + "loss": 1.4247, + "step": 2294 + }, + { + 
"epoch": 0.7174116911534855, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018464906832769517, + "loss": 1.5627, + "step": 2295 + }, + { + "epoch": 0.7177242888402626, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001846359843161142, + "loss": 1.8247, + "step": 2296 + }, + { + "epoch": 0.7180368865270397, + "grad_norm": 0.234375, + "learning_rate": 0.000184622895194951, + "loss": 1.6003, + "step": 2297 + }, + { + "epoch": 0.7183494842138168, + "grad_norm": 0.236328125, + "learning_rate": 0.0001846098009649958, + "loss": 1.6546, + "step": 2298 + }, + { + "epoch": 0.7186620819005939, + "grad_norm": 0.23046875, + "learning_rate": 0.00018459670162703905, + "loss": 1.8521, + "step": 2299 + }, + { + "epoch": 0.718974679587371, + "grad_norm": 0.2421875, + "learning_rate": 0.00018458359718187165, + "loss": 1.7397, + "step": 2300 + }, + { + "epoch": 0.7192872772741482, + "grad_norm": 0.232421875, + "learning_rate": 0.0001845704876302847, + "loss": 1.7336, + "step": 2301 + }, + { + "epoch": 0.7195998749609253, + "grad_norm": 0.2490234375, + "learning_rate": 0.00018455737297306963, + "loss": 1.6112, + "step": 2302 + }, + { + "epoch": 0.7199124726477024, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018454425321101826, + "loss": 1.8522, + "step": 2303 + }, + { + "epoch": 0.7202250703344796, + "grad_norm": 0.236328125, + "learning_rate": 0.0001845311283449225, + "loss": 1.6348, + "step": 2304 + }, + { + "epoch": 0.7205376680212566, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018451799837557485, + "loss": 1.7101, + "step": 2305 + }, + { + "epoch": 0.7208502657080338, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018450486330376793, + "loss": 1.4738, + "step": 2306 + }, + { + "epoch": 0.7211628633948108, + "grad_norm": 0.23828125, + "learning_rate": 0.00018449172313029472, + "loss": 1.6334, + "step": 2307 + }, + { + "epoch": 0.721475461081588, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018447857785594852, + "loss": 1.5218, + "step": 2308 + }, + { + "epoch": 0.7217880587683652, + "grad_norm": 0.2421875, + "learning_rate": 0.00018446542748152292, + "loss": 1.8324, + "step": 2309 + }, + { + "epoch": 0.7221006564551422, + "grad_norm": 0.234375, + "learning_rate": 0.00018445227200781185, + "loss": 1.8051, + "step": 2310 + }, + { + "epoch": 0.7224132541419194, + "grad_norm": 0.228515625, + "learning_rate": 0.0001844391114356095, + "loss": 1.65, + "step": 2311 + }, + { + "epoch": 0.7227258518286964, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018442594576571035, + "loss": 1.8499, + "step": 2312 + }, + { + "epoch": 0.7230384495154736, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001844127749989093, + "loss": 1.671, + "step": 2313 + }, + { + "epoch": 0.7233510472022507, + "grad_norm": 0.22265625, + "learning_rate": 0.0001843995991360014, + "loss": 1.7405, + "step": 2314 + }, + { + "epoch": 0.7236636448890278, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001843864181777822, + "loss": 1.8025, + "step": 2315 + }, + { + "epoch": 0.723976242575805, + "grad_norm": 0.244140625, + "learning_rate": 0.00018437323212504742, + "loss": 1.5695, + "step": 2316 + }, + { + "epoch": 0.7242888402625821, + "grad_norm": 0.22265625, + "learning_rate": 0.00018436004097859308, + "loss": 1.2384, + "step": 2317 + }, + { + "epoch": 0.7246014379493592, + "grad_norm": 0.2265625, + "learning_rate": 0.00018434684473921556, + "loss": 1.6555, + "step": 2318 + }, + { + "epoch": 0.7249140356361363, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018433364340771153, + "loss": 1.6447, + 
"step": 2319 + }, + { + "epoch": 0.7252266333229134, + "grad_norm": 0.232421875, + "learning_rate": 0.00018432043698487797, + "loss": 1.6859, + "step": 2320 + }, + { + "epoch": 0.7255392310096905, + "grad_norm": 0.232421875, + "learning_rate": 0.0001843072254715122, + "loss": 1.7087, + "step": 2321 + }, + { + "epoch": 0.7258518286964677, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001842940088684118, + "loss": 1.7149, + "step": 2322 + }, + { + "epoch": 0.7261644263832447, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018428078717637467, + "loss": 1.8408, + "step": 2323 + }, + { + "epoch": 0.7264770240700219, + "grad_norm": 0.2294921875, + "learning_rate": 0.000184267560396199, + "loss": 1.7943, + "step": 2324 + }, + { + "epoch": 0.726789621756799, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018425432852868333, + "loss": 1.7252, + "step": 2325 + }, + { + "epoch": 0.7271022194435761, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001842410915746265, + "loss": 1.6914, + "step": 2326 + }, + { + "epoch": 0.7274148171303533, + "grad_norm": 0.232421875, + "learning_rate": 0.0001842278495348276, + "loss": 1.9011, + "step": 2327 + }, + { + "epoch": 0.7277274148171303, + "grad_norm": 0.236328125, + "learning_rate": 0.00018421460241008607, + "loss": 1.8245, + "step": 2328 + }, + { + "epoch": 0.7280400125039075, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018420135020120172, + "loss": 1.8638, + "step": 2329 + }, + { + "epoch": 0.7283526101906845, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018418809290897455, + "loss": 1.7493, + "step": 2330 + }, + { + "epoch": 0.7286652078774617, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001841748305342049, + "loss": 1.5843, + "step": 2331 + }, + { + "epoch": 0.7289778055642389, + "grad_norm": 0.236328125, + "learning_rate": 0.0001841615630776935, + "loss": 1.5289, + "step": 2332 + }, + { + "epoch": 0.7292904032510159, + "grad_norm": 0.240234375, + "learning_rate": 0.00018414829054024128, + "loss": 1.6851, + "step": 2333 + }, + { + "epoch": 0.7296030009377931, + "grad_norm": 0.21875, + "learning_rate": 0.0001841350129226495, + "loss": 1.3236, + "step": 2334 + }, + { + "epoch": 0.7299155986245702, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018412173022571982, + "loss": 1.9465, + "step": 2335 + }, + { + "epoch": 0.7302281963113473, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018410844245025408, + "loss": 1.7362, + "step": 2336 + }, + { + "epoch": 0.7305407939981244, + "grad_norm": 0.228515625, + "learning_rate": 0.00018409514959705448, + "loss": 1.7688, + "step": 2337 + }, + { + "epoch": 0.7308533916849015, + "grad_norm": 0.24609375, + "learning_rate": 0.0001840818516669235, + "loss": 1.658, + "step": 2338 + }, + { + "epoch": 0.7311659893716786, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018406854866066403, + "loss": 1.6786, + "step": 2339 + }, + { + "epoch": 0.7314785870584558, + "grad_norm": 0.23828125, + "learning_rate": 0.00018405524057907915, + "loss": 1.6658, + "step": 2340 + }, + { + "epoch": 0.7317911847452329, + "grad_norm": 0.2578125, + "learning_rate": 0.0001840419274229723, + "loss": 1.6022, + "step": 2341 + }, + { + "epoch": 0.73210378243201, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018402860919314713, + "loss": 1.7735, + "step": 2342 + }, + { + "epoch": 0.7324163801187871, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001840152858904078, + "loss": 1.4977, + "step": 2343 + }, + { + "epoch": 0.7327289778055642, + "grad_norm": 0.236328125, + "learning_rate": 
0.00018400195751555858, + "loss": 1.7735, + "step": 2344 + }, + { + "epoch": 0.7330415754923414, + "grad_norm": 0.2421875, + "learning_rate": 0.00018398862406940412, + "loss": 1.5705, + "step": 2345 + }, + { + "epoch": 0.7333541731791184, + "grad_norm": 0.228515625, + "learning_rate": 0.00018397528555274943, + "loss": 1.9914, + "step": 2346 + }, + { + "epoch": 0.7336667708658956, + "grad_norm": 0.224609375, + "learning_rate": 0.00018396194196639972, + "loss": 1.6567, + "step": 2347 + }, + { + "epoch": 0.7339793685526728, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001839485933111606, + "loss": 1.5779, + "step": 2348 + }, + { + "epoch": 0.7342919662394498, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018393523958783788, + "loss": 1.6902, + "step": 2349 + }, + { + "epoch": 0.734604563926227, + "grad_norm": 0.23046875, + "learning_rate": 0.00018392188079723786, + "loss": 1.8415, + "step": 2350 + }, + { + "epoch": 0.734917161613004, + "grad_norm": 0.2421875, + "learning_rate": 0.0001839085169401669, + "loss": 1.7724, + "step": 2351 + }, + { + "epoch": 0.7352297592997812, + "grad_norm": 0.23046875, + "learning_rate": 0.00018389514801743186, + "loss": 1.4619, + "step": 2352 + }, + { + "epoch": 0.7355423569865583, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018388177402983984, + "loss": 1.7035, + "step": 2353 + }, + { + "epoch": 0.7358549546733354, + "grad_norm": 0.23828125, + "learning_rate": 0.00018386839497819821, + "loss": 1.6311, + "step": 2354 + }, + { + "epoch": 0.7361675523601126, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018385501086331472, + "loss": 1.4891, + "step": 2355 + }, + { + "epoch": 0.7364801500468896, + "grad_norm": 0.23046875, + "learning_rate": 0.00018384162168599735, + "loss": 1.7706, + "step": 2356 + }, + { + "epoch": 0.7367927477336668, + "grad_norm": 0.259765625, + "learning_rate": 0.00018382822744705444, + "loss": 1.7342, + "step": 2357 + }, + { + "epoch": 0.7371053454204439, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001838148281472946, + "loss": 1.7338, + "step": 2358 + }, + { + "epoch": 0.737417943107221, + "grad_norm": 0.244140625, + "learning_rate": 0.0001838014237875268, + "loss": 1.6715, + "step": 2359 + }, + { + "epoch": 0.7377305407939981, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018378801436856027, + "loss": 1.8231, + "step": 2360 + }, + { + "epoch": 0.7380431384807753, + "grad_norm": 0.25, + "learning_rate": 0.00018377459989120452, + "loss": 1.6681, + "step": 2361 + }, + { + "epoch": 0.7383557361675523, + "grad_norm": 0.2265625, + "learning_rate": 0.00018376118035626942, + "loss": 1.6599, + "step": 2362 + }, + { + "epoch": 0.7386683338543295, + "grad_norm": 0.265625, + "learning_rate": 0.00018374775576456513, + "loss": 1.8036, + "step": 2363 + }, + { + "epoch": 0.7389809315411066, + "grad_norm": 0.232421875, + "learning_rate": 0.00018373432611690208, + "loss": 1.8082, + "step": 2364 + }, + { + "epoch": 0.7392935292278837, + "grad_norm": 0.2216796875, + "learning_rate": 0.0001837208914140911, + "loss": 1.4781, + "step": 2365 + }, + { + "epoch": 0.7396061269146609, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018370745165694318, + "loss": 1.3993, + "step": 2366 + }, + { + "epoch": 0.7399187246014379, + "grad_norm": 0.23046875, + "learning_rate": 0.00018369400684626976, + "loss": 1.5936, + "step": 2367 + }, + { + "epoch": 0.7402313222882151, + "grad_norm": 0.2265625, + "learning_rate": 0.00018368055698288248, + "loss": 1.4418, + "step": 2368 + }, + { + "epoch": 0.7405439199749921, + "grad_norm": 
0.2412109375, + "learning_rate": 0.00018366710206759335, + "loss": 1.5162, + "step": 2369 + }, + { + "epoch": 0.7408565176617693, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018365364210121466, + "loss": 1.9776, + "step": 2370 + }, + { + "epoch": 0.7411691153485465, + "grad_norm": 0.2265625, + "learning_rate": 0.00018364017708455895, + "loss": 1.3729, + "step": 2371 + }, + { + "epoch": 0.7414817130353235, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001836267070184392, + "loss": 1.6031, + "step": 2372 + }, + { + "epoch": 0.7417943107221007, + "grad_norm": 0.251953125, + "learning_rate": 0.0001836132319036686, + "loss": 1.5944, + "step": 2373 + }, + { + "epoch": 0.7421069084088777, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001835997517410606, + "loss": 1.3653, + "step": 2374 + }, + { + "epoch": 0.7424195060956549, + "grad_norm": 0.232421875, + "learning_rate": 0.0001835862665314291, + "loss": 1.7253, + "step": 2375 + }, + { + "epoch": 0.742732103782432, + "grad_norm": 0.234375, + "learning_rate": 0.00018357277627558815, + "loss": 1.326, + "step": 2376 + }, + { + "epoch": 0.7430447014692091, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018355928097435218, + "loss": 1.8161, + "step": 2377 + }, + { + "epoch": 0.7433572991559863, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018354578062853595, + "loss": 1.8656, + "step": 2378 + }, + { + "epoch": 0.7436698968427634, + "grad_norm": 0.23046875, + "learning_rate": 0.0001835322752389545, + "loss": 1.8657, + "step": 2379 + }, + { + "epoch": 0.7439824945295405, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001835187648064231, + "loss": 1.5715, + "step": 2380 + }, + { + "epoch": 0.7442950922163176, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001835052493317575, + "loss": 1.7185, + "step": 2381 + }, + { + "epoch": 0.7446076899030947, + "grad_norm": 0.234375, + "learning_rate": 0.00018349172881577356, + "loss": 1.7779, + "step": 2382 + }, + { + "epoch": 0.7449202875898718, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018347820325928754, + "loss": 1.9479, + "step": 2383 + }, + { + "epoch": 0.745232885276649, + "grad_norm": 0.234375, + "learning_rate": 0.00018346467266311604, + "loss": 1.7667, + "step": 2384 + }, + { + "epoch": 0.745545482963426, + "grad_norm": 0.248046875, + "learning_rate": 0.00018345113702807585, + "loss": 1.4014, + "step": 2385 + }, + { + "epoch": 0.7458580806502032, + "grad_norm": 0.240234375, + "learning_rate": 0.00018343759635498422, + "loss": 1.8576, + "step": 2386 + }, + { + "epoch": 0.7461706783369803, + "grad_norm": 0.23828125, + "learning_rate": 0.00018342405064465856, + "loss": 1.6006, + "step": 2387 + }, + { + "epoch": 0.7464832760237574, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018341049989791666, + "loss": 1.5874, + "step": 2388 + }, + { + "epoch": 0.7467958737105346, + "grad_norm": 0.251953125, + "learning_rate": 0.00018339694411557655, + "loss": 1.6729, + "step": 2389 + }, + { + "epoch": 0.7471084713973116, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018338338329845668, + "loss": 1.5282, + "step": 2390 + }, + { + "epoch": 0.7474210690840888, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018336981744737573, + "loss": 1.5829, + "step": 2391 + }, + { + "epoch": 0.747733666770866, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001833562465631526, + "loss": 1.6278, + "step": 2392 + }, + { + "epoch": 0.748046264457643, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018334267064660668, + "loss": 1.6944, + "step": 2393 + }, + { + "epoch": 
0.7483588621444202, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018332908969855753, + "loss": 1.8641, + "step": 2394 + }, + { + "epoch": 0.7486714598311972, + "grad_norm": 0.2421875, + "learning_rate": 0.00018331550371982505, + "loss": 1.6727, + "step": 2395 + }, + { + "epoch": 0.7489840575179744, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018330191271122943, + "loss": 1.6077, + "step": 2396 + }, + { + "epoch": 0.7492966552047515, + "grad_norm": 0.244140625, + "learning_rate": 0.0001832883166735912, + "loss": 1.4713, + "step": 2397 + }, + { + "epoch": 0.7496092528915286, + "grad_norm": 0.248046875, + "learning_rate": 0.00018327471560773112, + "loss": 1.9724, + "step": 2398 + }, + { + "epoch": 0.7499218505783057, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018326110951447037, + "loss": 1.852, + "step": 2399 + }, + { + "epoch": 0.7502344482650828, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018324749839463035, + "loss": 1.7013, + "step": 2400 + }, + { + "epoch": 0.75054704595186, + "grad_norm": 0.234375, + "learning_rate": 0.00018323388224903274, + "loss": 2.0012, + "step": 2401 + }, + { + "epoch": 0.7508596436386371, + "grad_norm": 0.2421875, + "learning_rate": 0.0001832202610784996, + "loss": 1.7133, + "step": 2402 + }, + { + "epoch": 0.7511722413254142, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018320663488385327, + "loss": 1.7841, + "step": 2403 + }, + { + "epoch": 0.7514848390121913, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018319300366591637, + "loss": 1.8134, + "step": 2404 + }, + { + "epoch": 0.7517974366989685, + "grad_norm": 0.23046875, + "learning_rate": 0.00018317936742551178, + "loss": 1.5865, + "step": 2405 + }, + { + "epoch": 0.7521100343857455, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001831657261634628, + "loss": 1.6447, + "step": 2406 + }, + { + "epoch": 0.7524226320725227, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018315207988059298, + "loss": 1.4747, + "step": 2407 + }, + { + "epoch": 0.7527352297592997, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001831384285777261, + "loss": 1.7538, + "step": 2408 + }, + { + "epoch": 0.7530478274460769, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018312477225568635, + "loss": 1.6004, + "step": 2409 + }, + { + "epoch": 0.7533604251328541, + "grad_norm": 0.25, + "learning_rate": 0.00018311111091529818, + "loss": 1.6864, + "step": 2410 + }, + { + "epoch": 0.7536730228196311, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018309744455738633, + "loss": 1.8215, + "step": 2411 + }, + { + "epoch": 0.7539856205064083, + "grad_norm": 0.236328125, + "learning_rate": 0.00018308377318277587, + "loss": 1.672, + "step": 2412 + }, + { + "epoch": 0.7542982181931853, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001830700967922921, + "loss": 1.7247, + "step": 2413 + }, + { + "epoch": 0.7546108158799625, + "grad_norm": 0.24609375, + "learning_rate": 0.00018305641538676079, + "loss": 1.6188, + "step": 2414 + }, + { + "epoch": 0.7549234135667396, + "grad_norm": 0.251953125, + "learning_rate": 0.00018304272896700784, + "loss": 1.8593, + "step": 2415 + }, + { + "epoch": 0.7552360112535167, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001830290375338595, + "loss": 1.5332, + "step": 2416 + }, + { + "epoch": 0.7555486089402939, + "grad_norm": 0.224609375, + "learning_rate": 0.00018301534108814234, + "loss": 1.5756, + "step": 2417 + }, + { + "epoch": 0.755861206627071, + "grad_norm": 0.244140625, + "learning_rate": 0.0001830016396306833, + "loss": 2.0714, + 
"step": 2418 + }, + { + "epoch": 0.7561738043138481, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018298793316230948, + "loss": 1.64, + "step": 2419 + }, + { + "epoch": 0.7564864020006252, + "grad_norm": 0.220703125, + "learning_rate": 0.00018297422168384836, + "loss": 1.5317, + "step": 2420 + }, + { + "epoch": 0.7567989996874023, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018296050519612777, + "loss": 1.8879, + "step": 2421 + }, + { + "epoch": 0.7571115973741794, + "grad_norm": 0.2421875, + "learning_rate": 0.00018294678369997578, + "loss": 1.7005, + "step": 2422 + }, + { + "epoch": 0.7574241950609566, + "grad_norm": 0.376953125, + "learning_rate": 0.00018293305719622072, + "loss": 2.244, + "step": 2423 + }, + { + "epoch": 0.7577367927477336, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018291932568569134, + "loss": 1.5323, + "step": 2424 + }, + { + "epoch": 0.7580493904345108, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018290558916921659, + "loss": 1.6395, + "step": 2425 + }, + { + "epoch": 0.7583619881212879, + "grad_norm": 0.25, + "learning_rate": 0.00018289184764762575, + "loss": 1.648, + "step": 2426 + }, + { + "epoch": 0.758674585808065, + "grad_norm": 0.234375, + "learning_rate": 0.0001828781011217485, + "loss": 1.5622, + "step": 2427 + }, + { + "epoch": 0.7589871834948422, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018286434959241462, + "loss": 1.5481, + "step": 2428 + }, + { + "epoch": 0.7592997811816192, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018285059306045437, + "loss": 1.77, + "step": 2429 + }, + { + "epoch": 0.7596123788683964, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018283683152669824, + "loss": 1.4071, + "step": 2430 + }, + { + "epoch": 0.7599249765551734, + "grad_norm": 0.2490234375, + "learning_rate": 0.00018282306499197703, + "loss": 2.0644, + "step": 2431 + }, + { + "epoch": 0.7602375742419506, + "grad_norm": 0.2421875, + "learning_rate": 0.00018280929345712186, + "loss": 1.7075, + "step": 2432 + }, + { + "epoch": 0.7605501719287278, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001827955169229641, + "loss": 1.3107, + "step": 2433 + }, + { + "epoch": 0.7608627696155048, + "grad_norm": 0.23046875, + "learning_rate": 0.00018278173539033548, + "loss": 1.7646, + "step": 2434 + }, + { + "epoch": 0.761175367302282, + "grad_norm": 0.23828125, + "learning_rate": 0.00018276794886006804, + "loss": 2.0252, + "step": 2435 + }, + { + "epoch": 0.7614879649890591, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018275415733299402, + "loss": 1.5208, + "step": 2436 + }, + { + "epoch": 0.7618005626758362, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018274036080994605, + "loss": 1.8906, + "step": 2437 + }, + { + "epoch": 0.7621131603626133, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018272655929175708, + "loss": 1.8472, + "step": 2438 + }, + { + "epoch": 0.7624257580493904, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001827127527792603, + "loss": 1.7364, + "step": 2439 + }, + { + "epoch": 0.7627383557361676, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018269894127328926, + "loss": 1.8149, + "step": 2440 + }, + { + "epoch": 0.7630509534229447, + "grad_norm": 0.232421875, + "learning_rate": 0.00018268512477467774, + "loss": 1.8335, + "step": 2441 + }, + { + "epoch": 0.7633635511097218, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018267130328425985, + "loss": 1.7762, + "step": 2442 + }, + { + "epoch": 0.7636761487964989, + "grad_norm": 0.2265625, + "learning_rate": 
0.00018265747680287008, + "loss": 1.5251, + "step": 2443 + }, + { + "epoch": 0.763988746483276, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018264364533134304, + "loss": 1.5232, + "step": 2444 + }, + { + "epoch": 0.7643013441700531, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018262980887051385, + "loss": 1.5101, + "step": 2445 + }, + { + "epoch": 0.7646139418568303, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018261596742121777, + "loss": 1.6831, + "step": 2446 + }, + { + "epoch": 0.7649265395436073, + "grad_norm": 0.24609375, + "learning_rate": 0.00018260212098429054, + "loss": 1.8748, + "step": 2447 + }, + { + "epoch": 0.7652391372303845, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018258826956056793, + "loss": 1.7539, + "step": 2448 + }, + { + "epoch": 0.7655517349171617, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018257441315088627, + "loss": 1.5779, + "step": 2449 + }, + { + "epoch": 0.7658643326039387, + "grad_norm": 0.2421875, + "learning_rate": 0.00018256055175608205, + "loss": 1.7147, + "step": 2450 + }, + { + "epoch": 0.7661769302907159, + "grad_norm": 0.2265625, + "learning_rate": 0.00018254668537699212, + "loss": 1.682, + "step": 2451 + }, + { + "epoch": 0.7664895279774929, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001825328140144536, + "loss": 1.8002, + "step": 2452 + }, + { + "epoch": 0.7668021256642701, + "grad_norm": 0.234375, + "learning_rate": 0.0001825189376693039, + "loss": 1.3419, + "step": 2453 + }, + { + "epoch": 0.7671147233510472, + "grad_norm": 0.24609375, + "learning_rate": 0.0001825050563423808, + "loss": 1.7038, + "step": 2454 + }, + { + "epoch": 0.7674273210378243, + "grad_norm": 0.232421875, + "learning_rate": 0.00018249117003452234, + "loss": 1.6925, + "step": 2455 + }, + { + "epoch": 0.7677399187246015, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018247727874656683, + "loss": 1.7601, + "step": 2456 + }, + { + "epoch": 0.7680525164113785, + "grad_norm": 0.23828125, + "learning_rate": 0.00018246338247935285, + "loss": 1.6095, + "step": 2457 + }, + { + "epoch": 0.7683651140981557, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001824494812337194, + "loss": 1.4805, + "step": 2458 + }, + { + "epoch": 0.7686777117849328, + "grad_norm": 0.23046875, + "learning_rate": 0.00018243557501050573, + "loss": 1.6642, + "step": 2459 + }, + { + "epoch": 0.7689903094717099, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018242166381055133, + "loss": 1.4541, + "step": 2460 + }, + { + "epoch": 0.769302907158487, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018240774763469606, + "loss": 1.5884, + "step": 2461 + }, + { + "epoch": 0.7696155048452642, + "grad_norm": 0.224609375, + "learning_rate": 0.00018239382648378006, + "loss": 1.6074, + "step": 2462 + }, + { + "epoch": 0.7699281025320412, + "grad_norm": 0.232421875, + "learning_rate": 0.00018237990035864372, + "loss": 1.7759, + "step": 2463 + }, + { + "epoch": 0.7702407002188184, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018236596926012787, + "loss": 1.6379, + "step": 2464 + }, + { + "epoch": 0.7705532979055955, + "grad_norm": 0.248046875, + "learning_rate": 0.00018235203318907347, + "loss": 1.7159, + "step": 2465 + }, + { + "epoch": 0.7708658955923726, + "grad_norm": 0.251953125, + "learning_rate": 0.00018233809214632184, + "loss": 1.6911, + "step": 2466 + }, + { + "epoch": 0.7711784932791498, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018232414613271475, + "loss": 1.422, + "step": 2467 + }, + { + "epoch": 0.7714910909659268, + 
"grad_norm": 0.2314453125, + "learning_rate": 0.00018231019514909397, + "loss": 1.551, + "step": 2468 + }, + { + "epoch": 0.771803688652704, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018229623919630188, + "loss": 1.8121, + "step": 2469 + }, + { + "epoch": 0.772116286339481, + "grad_norm": 0.2265625, + "learning_rate": 0.00018228227827518095, + "loss": 1.9086, + "step": 2470 + }, + { + "epoch": 0.7724288840262582, + "grad_norm": 0.330078125, + "learning_rate": 0.000182268312386574, + "loss": 2.5265, + "step": 2471 + }, + { + "epoch": 0.7727414817130354, + "grad_norm": 0.234375, + "learning_rate": 0.0001822543415313242, + "loss": 1.8133, + "step": 2472 + }, + { + "epoch": 0.7730540793998124, + "grad_norm": 0.255859375, + "learning_rate": 0.00018224036571027501, + "loss": 1.9486, + "step": 2473 + }, + { + "epoch": 0.7733666770865896, + "grad_norm": 0.240234375, + "learning_rate": 0.0001822263849242701, + "loss": 1.7464, + "step": 2474 + }, + { + "epoch": 0.7736792747733667, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001822123991741536, + "loss": 1.743, + "step": 2475 + }, + { + "epoch": 0.7739918724601438, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018219840846076977, + "loss": 1.4856, + "step": 2476 + }, + { + "epoch": 0.7743044701469209, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018218441278496328, + "loss": 1.7813, + "step": 2477 + }, + { + "epoch": 0.774617067833698, + "grad_norm": 0.234375, + "learning_rate": 0.00018217041214757903, + "loss": 1.7274, + "step": 2478 + }, + { + "epoch": 0.7749296655204752, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018215640654946233, + "loss": 1.5569, + "step": 2479 + }, + { + "epoch": 0.7752422632072523, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018214239599145866, + "loss": 1.5575, + "step": 2480 + }, + { + "epoch": 0.7755548608940294, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018212838047441387, + "loss": 1.5972, + "step": 2481 + }, + { + "epoch": 0.7758674585808065, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001821143599991741, + "loss": 1.668, + "step": 2482 + }, + { + "epoch": 0.7761800562675836, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018210033456658576, + "loss": 1.646, + "step": 2483 + }, + { + "epoch": 0.7764926539543607, + "grad_norm": 0.26171875, + "learning_rate": 0.00018208630417749561, + "loss": 2.3322, + "step": 2484 + }, + { + "epoch": 0.7768052516411379, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018207226883275069, + "loss": 1.5657, + "step": 2485 + }, + { + "epoch": 0.777117849327915, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001820582285331983, + "loss": 1.4964, + "step": 2486 + }, + { + "epoch": 0.7774304470146921, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018204418327968607, + "loss": 1.5711, + "step": 2487 + }, + { + "epoch": 0.7777430447014692, + "grad_norm": 0.26171875, + "learning_rate": 0.00018203013307306195, + "loss": 1.999, + "step": 2488 + }, + { + "epoch": 0.7780556423882463, + "grad_norm": 0.244140625, + "learning_rate": 0.00018201607791417418, + "loss": 1.5581, + "step": 2489 + }, + { + "epoch": 0.7783682400750235, + "grad_norm": 0.23046875, + "learning_rate": 0.00018200201780387126, + "loss": 1.5618, + "step": 2490 + }, + { + "epoch": 0.7786808377618005, + "grad_norm": 0.251953125, + "learning_rate": 0.00018198795274300205, + "loss": 1.6855, + "step": 2491 + }, + { + "epoch": 0.7789934354485777, + "grad_norm": 0.23046875, + "learning_rate": 0.00018197388273241563, + "loss": 1.4388, + "step": 2492 + }, + { + 
"epoch": 0.7793060331353548, + "grad_norm": 0.23046875, + "learning_rate": 0.00018195980777296146, + "loss": 1.3961, + "step": 2493 + }, + { + "epoch": 0.7796186308221319, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018194572786548924, + "loss": 1.3543, + "step": 2494 + }, + { + "epoch": 0.7799312285089091, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018193164301084905, + "loss": 1.6291, + "step": 2495 + }, + { + "epoch": 0.7802438261956861, + "grad_norm": 0.244140625, + "learning_rate": 0.00018191755320989112, + "loss": 1.8612, + "step": 2496 + }, + { + "epoch": 0.7805564238824633, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018190345846346613, + "loss": 1.507, + "step": 2497 + }, + { + "epoch": 0.7808690215692404, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018188935877242496, + "loss": 1.4034, + "step": 2498 + }, + { + "epoch": 0.7811816192560175, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018187525413761887, + "loss": 1.3682, + "step": 2499 + }, + { + "epoch": 0.7814942169427946, + "grad_norm": 0.25, + "learning_rate": 0.00018186114455989936, + "loss": 1.3907, + "step": 2500 + }, + { + "epoch": 0.7818068146295717, + "grad_norm": 0.251953125, + "learning_rate": 0.00018184703004011822, + "loss": 1.506, + "step": 2501 + }, + { + "epoch": 0.7821194123163488, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018183291057912758, + "loss": 1.6376, + "step": 2502 + }, + { + "epoch": 0.782432010003126, + "grad_norm": 0.23046875, + "learning_rate": 0.00018181878617777985, + "loss": 1.6524, + "step": 2503 + }, + { + "epoch": 0.7827446076899031, + "grad_norm": 0.251953125, + "learning_rate": 0.00018180465683692774, + "loss": 1.6575, + "step": 2504 + }, + { + "epoch": 0.7830572053766802, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018179052255742423, + "loss": 1.6608, + "step": 2505 + }, + { + "epoch": 0.7833698030634574, + "grad_norm": 0.2255859375, + "learning_rate": 0.00018177638334012267, + "loss": 1.7274, + "step": 2506 + }, + { + "epoch": 0.7836824007502344, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018176223918587664, + "loss": 1.7459, + "step": 2507 + }, + { + "epoch": 0.7839949984370116, + "grad_norm": 0.232421875, + "learning_rate": 0.00018174809009554005, + "loss": 1.366, + "step": 2508 + }, + { + "epoch": 0.7843075961237886, + "grad_norm": 0.234375, + "learning_rate": 0.00018173393606996707, + "loss": 1.7907, + "step": 2509 + }, + { + "epoch": 0.7846201938105658, + "grad_norm": 0.240234375, + "learning_rate": 0.0001817197771100122, + "loss": 1.7705, + "step": 2510 + }, + { + "epoch": 0.784932791497343, + "grad_norm": 0.248046875, + "learning_rate": 0.00018170561321653026, + "loss": 1.4995, + "step": 2511 + }, + { + "epoch": 0.78524538918412, + "grad_norm": 0.24609375, + "learning_rate": 0.00018169144439037632, + "loss": 1.6226, + "step": 2512 + }, + { + "epoch": 0.7855579868708972, + "grad_norm": 0.240234375, + "learning_rate": 0.00018167727063240582, + "loss": 1.619, + "step": 2513 + }, + { + "epoch": 0.7858705845576742, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018166309194347438, + "loss": 1.9021, + "step": 2514 + }, + { + "epoch": 0.7861831822444514, + "grad_norm": 0.2275390625, + "learning_rate": 0.000181648908324438, + "loss": 1.9489, + "step": 2515 + }, + { + "epoch": 0.7864957799312285, + "grad_norm": 0.2421875, + "learning_rate": 0.00018163471977615303, + "loss": 1.5399, + "step": 2516 + }, + { + "epoch": 0.7868083776180056, + "grad_norm": 0.236328125, + "learning_rate": 0.000181620526299476, + "loss": 
1.5515, + "step": 2517 + }, + { + "epoch": 0.7871209753047828, + "grad_norm": 0.240234375, + "learning_rate": 0.00018160632789526374, + "loss": 1.4493, + "step": 2518 + }, + { + "epoch": 0.7874335729915599, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018159212456437347, + "loss": 1.6494, + "step": 2519 + }, + { + "epoch": 0.787746170678337, + "grad_norm": 0.23828125, + "learning_rate": 0.0001815779163076627, + "loss": 1.7547, + "step": 2520 + }, + { + "epoch": 0.7880587683651141, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018156370312598914, + "loss": 1.7275, + "step": 2521 + }, + { + "epoch": 0.7883713660518912, + "grad_norm": 0.2294921875, + "learning_rate": 0.0001815494850202109, + "loss": 1.3418, + "step": 2522 + }, + { + "epoch": 0.7886839637386683, + "grad_norm": 0.2431640625, + "learning_rate": 0.00018153526199118634, + "loss": 1.5102, + "step": 2523 + }, + { + "epoch": 0.7889965614254455, + "grad_norm": 0.248046875, + "learning_rate": 0.0001815210340397741, + "loss": 2.0452, + "step": 2524 + }, + { + "epoch": 0.7893091591122225, + "grad_norm": 0.25, + "learning_rate": 0.00018150680116683313, + "loss": 1.5017, + "step": 2525 + }, + { + "epoch": 0.7896217567989997, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018149256337322275, + "loss": 2.0215, + "step": 2526 + }, + { + "epoch": 0.7899343544857768, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018147832065980245, + "loss": 1.7694, + "step": 2527 + }, + { + "epoch": 0.7902469521725539, + "grad_norm": 0.240234375, + "learning_rate": 0.00018146407302743208, + "loss": 1.6186, + "step": 2528 + }, + { + "epoch": 0.7905595498593311, + "grad_norm": 0.232421875, + "learning_rate": 0.00018144982047697185, + "loss": 1.7227, + "step": 2529 + }, + { + "epoch": 0.7908721475461081, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018143556300928215, + "loss": 1.6313, + "step": 2530 + }, + { + "epoch": 0.7911847452328853, + "grad_norm": 0.232421875, + "learning_rate": 0.00018142130062522377, + "loss": 1.4294, + "step": 2531 + }, + { + "epoch": 0.7914973429196624, + "grad_norm": 0.23828125, + "learning_rate": 0.00018140703332565768, + "loss": 1.5747, + "step": 2532 + }, + { + "epoch": 0.7918099406064395, + "grad_norm": 0.25390625, + "learning_rate": 0.00018139276111144525, + "loss": 1.6087, + "step": 2533 + }, + { + "epoch": 0.7921225382932167, + "grad_norm": 0.248046875, + "learning_rate": 0.0001813784839834481, + "loss": 1.6986, + "step": 2534 + }, + { + "epoch": 0.7924351359799937, + "grad_norm": 0.251953125, + "learning_rate": 0.00018136420194252818, + "loss": 1.5952, + "step": 2535 + }, + { + "epoch": 0.7927477336667709, + "grad_norm": 0.2392578125, + "learning_rate": 0.00018134991498954773, + "loss": 1.7808, + "step": 2536 + }, + { + "epoch": 0.793060331353548, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001813356231253692, + "loss": 1.518, + "step": 2537 + }, + { + "epoch": 0.7933729290403251, + "grad_norm": 0.2421875, + "learning_rate": 0.0001813213263508555, + "loss": 1.82, + "step": 2538 + }, + { + "epoch": 0.7936855267271022, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001813070246668697, + "loss": 1.5595, + "step": 2539 + }, + { + "epoch": 0.7939981244138793, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018129271807427517, + "loss": 1.8371, + "step": 2540 + }, + { + "epoch": 0.7943107221006565, + "grad_norm": 0.2265625, + "learning_rate": 0.0001812784065739357, + "loss": 1.5297, + "step": 2541 + }, + { + "epoch": 0.7946233197874336, + "grad_norm": 0.236328125, + "learning_rate": 
0.0001812640901667152, + "loss": 1.6262, + "step": 2542 + }, + { + "epoch": 0.7949359174742107, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018124976885347806, + "loss": 1.7128, + "step": 2543 + }, + { + "epoch": 0.7952485151609878, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018123544263508884, + "loss": 1.9219, + "step": 2544 + }, + { + "epoch": 0.7955611128477649, + "grad_norm": 0.228515625, + "learning_rate": 0.00018122111151241241, + "loss": 1.5844, + "step": 2545 + }, + { + "epoch": 0.795873710534542, + "grad_norm": 0.2333984375, + "learning_rate": 0.000181206775486314, + "loss": 1.806, + "step": 2546 + }, + { + "epoch": 0.7961863082213192, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018119243455765903, + "loss": 1.648, + "step": 2547 + }, + { + "epoch": 0.7964989059080962, + "grad_norm": 0.251953125, + "learning_rate": 0.00018117808872731336, + "loss": 1.5256, + "step": 2548 + }, + { + "epoch": 0.7968115035948734, + "grad_norm": 0.2421875, + "learning_rate": 0.000181163737996143, + "loss": 1.491, + "step": 2549 + }, + { + "epoch": 0.7971241012816506, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018114938236501438, + "loss": 1.8205, + "step": 2550 + }, + { + "epoch": 0.7974366989684276, + "grad_norm": 0.234375, + "learning_rate": 0.0001811350218347941, + "loss": 1.6017, + "step": 2551 + }, + { + "epoch": 0.7977492966552048, + "grad_norm": 0.240234375, + "learning_rate": 0.0001811206564063492, + "loss": 1.4423, + "step": 2552 + }, + { + "epoch": 0.7980618943419818, + "grad_norm": 0.255859375, + "learning_rate": 0.00018110628608054686, + "loss": 1.8525, + "step": 2553 + }, + { + "epoch": 0.798374492028759, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001810919108582547, + "loss": 1.7098, + "step": 2554 + }, + { + "epoch": 0.7986870897155361, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018107753074034054, + "loss": 1.7347, + "step": 2555 + }, + { + "epoch": 0.7989996874023132, + "grad_norm": 0.244140625, + "learning_rate": 0.00018106314572767252, + "loss": 1.6353, + "step": 2556 + }, + { + "epoch": 0.7993122850890904, + "grad_norm": 0.244140625, + "learning_rate": 0.00018104875582111913, + "loss": 1.7014, + "step": 2557 + }, + { + "epoch": 0.7996248827758674, + "grad_norm": 0.23046875, + "learning_rate": 0.00018103436102154903, + "loss": 1.5313, + "step": 2558 + }, + { + "epoch": 0.7999374804626446, + "grad_norm": 0.24609375, + "learning_rate": 0.0001810199613298313, + "loss": 1.671, + "step": 2559 + }, + { + "epoch": 0.8002500781494217, + "grad_norm": 0.240234375, + "learning_rate": 0.00018100555674683527, + "loss": 1.5859, + "step": 2560 + }, + { + "epoch": 0.8005626758361988, + "grad_norm": 0.232421875, + "learning_rate": 0.00018099114727343057, + "loss": 1.4992, + "step": 2561 + }, + { + "epoch": 0.8008752735229759, + "grad_norm": 0.232421875, + "learning_rate": 0.00018097673291048706, + "loss": 1.6654, + "step": 2562 + }, + { + "epoch": 0.8011878712097531, + "grad_norm": 0.236328125, + "learning_rate": 0.000180962313658875, + "loss": 1.6192, + "step": 2563 + }, + { + "epoch": 0.8015004688965301, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001809478895194649, + "loss": 1.7311, + "step": 2564 + }, + { + "epoch": 0.8018130665833073, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018093346049312758, + "loss": 1.5685, + "step": 2565 + }, + { + "epoch": 0.8021256642700844, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001809190265807341, + "loss": 1.9562, + "step": 2566 + }, + { + "epoch": 0.8024382619568615, + "grad_norm": 
0.251953125, + "learning_rate": 0.00018090458778315588, + "loss": 1.662, + "step": 2567 + }, + { + "epoch": 0.8027508596436387, + "grad_norm": 0.251953125, + "learning_rate": 0.00018089014410126457, + "loss": 1.611, + "step": 2568 + }, + { + "epoch": 0.8030634573304157, + "grad_norm": 0.2265625, + "learning_rate": 0.0001808756955359322, + "loss": 1.7113, + "step": 2569 + }, + { + "epoch": 0.8033760550171929, + "grad_norm": 0.234375, + "learning_rate": 0.00018086124208803103, + "loss": 1.3589, + "step": 2570 + }, + { + "epoch": 0.8036886527039699, + "grad_norm": 0.23828125, + "learning_rate": 0.00018084678375843364, + "loss": 1.819, + "step": 2571 + }, + { + "epoch": 0.8040012503907471, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018083232054801288, + "loss": 1.6764, + "step": 2572 + }, + { + "epoch": 0.8043138480775243, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001808178524576419, + "loss": 1.5922, + "step": 2573 + }, + { + "epoch": 0.8046264457643013, + "grad_norm": 0.251953125, + "learning_rate": 0.0001808033794881942, + "loss": 1.5336, + "step": 2574 + }, + { + "epoch": 0.8049390434510785, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001807889016405435, + "loss": 1.443, + "step": 2575 + }, + { + "epoch": 0.8052516411378556, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001807744189155639, + "loss": 1.7123, + "step": 2576 + }, + { + "epoch": 0.8055642388246327, + "grad_norm": 0.24609375, + "learning_rate": 0.00018075993131412966, + "loss": 1.9127, + "step": 2577 + }, + { + "epoch": 0.8058768365114098, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018074543883711547, + "loss": 1.7716, + "step": 2578 + }, + { + "epoch": 0.8061894341981869, + "grad_norm": 0.2470703125, + "learning_rate": 0.00018073094148539625, + "loss": 1.7905, + "step": 2579 + }, + { + "epoch": 0.806502031884964, + "grad_norm": 0.236328125, + "learning_rate": 0.00018071643925984717, + "loss": 1.5217, + "step": 2580 + }, + { + "epoch": 0.8068146295717412, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018070193216134384, + "loss": 1.6451, + "step": 2581 + }, + { + "epoch": 0.8071272272585183, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018068742019076203, + "loss": 1.7439, + "step": 2582 + }, + { + "epoch": 0.8074398249452954, + "grad_norm": 0.25, + "learning_rate": 0.0001806729033489778, + "loss": 2.0439, + "step": 2583 + }, + { + "epoch": 0.8077524226320725, + "grad_norm": 0.263671875, + "learning_rate": 0.0001806583816368676, + "loss": 1.7726, + "step": 2584 + }, + { + "epoch": 0.8080650203188496, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018064385505530813, + "loss": 1.8142, + "step": 2585 + }, + { + "epoch": 0.8083776180056268, + "grad_norm": 0.234375, + "learning_rate": 0.00018062932360517637, + "loss": 1.8507, + "step": 2586 + }, + { + "epoch": 0.8086902156924038, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001806147872873496, + "loss": 1.8861, + "step": 2587 + }, + { + "epoch": 0.809002813379181, + "grad_norm": 0.24609375, + "learning_rate": 0.00018060024610270538, + "loss": 2.04, + "step": 2588 + }, + { + "epoch": 0.8093154110659581, + "grad_norm": 0.23828125, + "learning_rate": 0.0001805857000521216, + "loss": 1.5433, + "step": 2589 + }, + { + "epoch": 0.8096280087527352, + "grad_norm": 0.23828125, + "learning_rate": 0.00018057114913647642, + "loss": 1.5803, + "step": 2590 + }, + { + "epoch": 0.8099406064395124, + "grad_norm": 0.244140625, + "learning_rate": 0.0001805565933566483, + "loss": 1.7928, + "step": 2591 + }, + { + "epoch": 
0.8102532041262894, + "grad_norm": 0.25390625, + "learning_rate": 0.00018054203271351599, + "loss": 1.8568, + "step": 2592 + }, + { + "epoch": 0.8105658018130666, + "grad_norm": 0.2353515625, + "learning_rate": 0.00018052746720795848, + "loss": 1.5727, + "step": 2593 + }, + { + "epoch": 0.8108783994998437, + "grad_norm": 0.251953125, + "learning_rate": 0.00018051289684085518, + "loss": 1.543, + "step": 2594 + }, + { + "epoch": 0.8111909971866208, + "grad_norm": 0.2421875, + "learning_rate": 0.00018049832161308574, + "loss": 1.5196, + "step": 2595 + }, + { + "epoch": 0.811503594873398, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018048374152553, + "loss": 1.592, + "step": 2596 + }, + { + "epoch": 0.811816192560175, + "grad_norm": 0.2421875, + "learning_rate": 0.00018046915657906826, + "loss": 1.6238, + "step": 2597 + }, + { + "epoch": 0.8121287902469522, + "grad_norm": 0.2421875, + "learning_rate": 0.00018045456677458094, + "loss": 1.6494, + "step": 2598 + }, + { + "epoch": 0.8124413879337293, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018043997211294896, + "loss": 1.7159, + "step": 2599 + }, + { + "epoch": 0.8127539856205064, + "grad_norm": 0.244140625, + "learning_rate": 0.00018042537259505332, + "loss": 1.7333, + "step": 2600 + }, + { + "epoch": 0.8130665833072835, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018041076822177546, + "loss": 1.7428, + "step": 2601 + }, + { + "epoch": 0.8133791809940606, + "grad_norm": 0.244140625, + "learning_rate": 0.00018039615899399704, + "loss": 1.5266, + "step": 2602 + }, + { + "epoch": 0.8136917786808378, + "grad_norm": 0.2451171875, + "learning_rate": 0.00018038154491260006, + "loss": 1.4482, + "step": 2603 + }, + { + "epoch": 0.8140043763676149, + "grad_norm": 0.25, + "learning_rate": 0.0001803669259784668, + "loss": 1.8164, + "step": 2604 + }, + { + "epoch": 0.814316974054392, + "grad_norm": 0.24609375, + "learning_rate": 0.00018035230219247978, + "loss": 1.7801, + "step": 2605 + }, + { + "epoch": 0.8146295717411691, + "grad_norm": 0.224609375, + "learning_rate": 0.0001803376735555219, + "loss": 1.5818, + "step": 2606 + }, + { + "epoch": 0.8149421694279463, + "grad_norm": 0.236328125, + "learning_rate": 0.0001803230400684763, + "loss": 2.0025, + "step": 2607 + }, + { + "epoch": 0.8152547671147233, + "grad_norm": 0.240234375, + "learning_rate": 0.0001803084017322264, + "loss": 1.6328, + "step": 2608 + }, + { + "epoch": 0.8155673648015005, + "grad_norm": 0.30859375, + "learning_rate": 0.00018029375854765597, + "loss": 2.289, + "step": 2609 + }, + { + "epoch": 0.8158799624882775, + "grad_norm": 0.2333984375, + "learning_rate": 0.00018027911051564897, + "loss": 1.4681, + "step": 2610 + }, + { + "epoch": 0.8161925601750547, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001802644576370898, + "loss": 1.7437, + "step": 2611 + }, + { + "epoch": 0.8165051578618319, + "grad_norm": 0.228515625, + "learning_rate": 0.00018024979991286303, + "loss": 2.0136, + "step": 2612 + }, + { + "epoch": 0.8168177555486089, + "grad_norm": 0.240234375, + "learning_rate": 0.0001802351373438536, + "loss": 1.6401, + "step": 2613 + }, + { + "epoch": 0.8171303532353861, + "grad_norm": 0.2373046875, + "learning_rate": 0.00018022046993094665, + "loss": 1.5986, + "step": 2614 + }, + { + "epoch": 0.8174429509221631, + "grad_norm": 0.228515625, + "learning_rate": 0.00018020579767502774, + "loss": 1.7392, + "step": 2615 + }, + { + "epoch": 0.8177555486089403, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001801911205769826, + "loss": 1.6622, + "step": 
2616 + }, + { + "epoch": 0.8180681462957174, + "grad_norm": 0.232421875, + "learning_rate": 0.0001801764386376973, + "loss": 1.6786, + "step": 2617 + }, + { + "epoch": 0.8183807439824945, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001801617518580583, + "loss": 1.6723, + "step": 2618 + }, + { + "epoch": 0.8186933416692717, + "grad_norm": 0.232421875, + "learning_rate": 0.0001801470602389521, + "loss": 1.6344, + "step": 2619 + }, + { + "epoch": 0.8190059393560488, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001801323637812658, + "loss": 1.8773, + "step": 2620 + }, + { + "epoch": 0.8193185370428259, + "grad_norm": 0.2412109375, + "learning_rate": 0.00018011766248588655, + "loss": 1.7633, + "step": 2621 + }, + { + "epoch": 0.819631134729603, + "grad_norm": 0.24609375, + "learning_rate": 0.00018010295635370192, + "loss": 1.7818, + "step": 2622 + }, + { + "epoch": 0.8199437324163801, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018008824538559977, + "loss": 1.5338, + "step": 2623 + }, + { + "epoch": 0.8202563301031572, + "grad_norm": 0.2890625, + "learning_rate": 0.00018007352958246818, + "loss": 2.1521, + "step": 2624 + }, + { + "epoch": 0.8205689277899344, + "grad_norm": 0.2421875, + "learning_rate": 0.00018005880894519555, + "loss": 1.6819, + "step": 2625 + }, + { + "epoch": 0.8208815254767114, + "grad_norm": 0.2314453125, + "learning_rate": 0.00018004408347467062, + "loss": 1.7966, + "step": 2626 + }, + { + "epoch": 0.8211941231634886, + "grad_norm": 0.2275390625, + "learning_rate": 0.00018002935317178235, + "loss": 1.5681, + "step": 2627 + }, + { + "epoch": 0.8215067208502657, + "grad_norm": 0.2490234375, + "learning_rate": 0.00018001461803742008, + "loss": 1.8119, + "step": 2628 + }, + { + "epoch": 0.8218193185370428, + "grad_norm": 0.259765625, + "learning_rate": 0.00017999987807247334, + "loss": 2.2241, + "step": 2629 + }, + { + "epoch": 0.82213191622382, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017998513327783199, + "loss": 1.5033, + "step": 2630 + }, + { + "epoch": 0.822444513910597, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017997038365438628, + "loss": 1.481, + "step": 2631 + }, + { + "epoch": 0.8227571115973742, + "grad_norm": 0.2578125, + "learning_rate": 0.00017995562920302652, + "loss": 1.7684, + "step": 2632 + }, + { + "epoch": 0.8230697092841514, + "grad_norm": 0.251953125, + "learning_rate": 0.0001799408699246436, + "loss": 1.6599, + "step": 2633 + }, + { + "epoch": 0.8233823069709284, + "grad_norm": 0.255859375, + "learning_rate": 0.00017992610582012847, + "loss": 1.3327, + "step": 2634 + }, + { + "epoch": 0.8236949046577056, + "grad_norm": 0.236328125, + "learning_rate": 0.0001799113368903725, + "loss": 1.7121, + "step": 2635 + }, + { + "epoch": 0.8240075023444826, + "grad_norm": 0.22265625, + "learning_rate": 0.00017989656313626727, + "loss": 1.766, + "step": 2636 + }, + { + "epoch": 0.8243201000312598, + "grad_norm": 0.2265625, + "learning_rate": 0.0001798817845587047, + "loss": 1.85, + "step": 2637 + }, + { + "epoch": 0.8246326977180369, + "grad_norm": 0.240234375, + "learning_rate": 0.000179867001158577, + "loss": 1.8962, + "step": 2638 + }, + { + "epoch": 0.824945295404814, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001798522129367767, + "loss": 1.4497, + "step": 2639 + }, + { + "epoch": 0.8252578930915911, + "grad_norm": 0.2421875, + "learning_rate": 0.00017983741989419655, + "loss": 1.6794, + "step": 2640 + }, + { + "epoch": 0.8255704907783682, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001798226220317296, + 
"loss": 1.718, + "step": 2641 + }, + { + "epoch": 0.8258830884651454, + "grad_norm": 0.23046875, + "learning_rate": 0.00017980781935026925, + "loss": 1.7489, + "step": 2642 + }, + { + "epoch": 0.8261956861519225, + "grad_norm": 0.25390625, + "learning_rate": 0.0001797930118507091, + "loss": 1.7344, + "step": 2643 + }, + { + "epoch": 0.8265082838386996, + "grad_norm": 0.25, + "learning_rate": 0.0001797781995339432, + "loss": 1.7674, + "step": 2644 + }, + { + "epoch": 0.8268208815254767, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001797633824008657, + "loss": 2.0352, + "step": 2645 + }, + { + "epoch": 0.8271334792122538, + "grad_norm": 0.2265625, + "learning_rate": 0.00017974856045237117, + "loss": 1.6354, + "step": 2646 + }, + { + "epoch": 0.8274460768990309, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017973373368935445, + "loss": 1.737, + "step": 2647 + }, + { + "epoch": 0.8277586745858081, + "grad_norm": 0.25, + "learning_rate": 0.00017971890211271059, + "loss": 1.7081, + "step": 2648 + }, + { + "epoch": 0.8280712722725851, + "grad_norm": 0.251953125, + "learning_rate": 0.000179704065723335, + "loss": 1.3865, + "step": 2649 + }, + { + "epoch": 0.8283838699593623, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017968922452212343, + "loss": 1.5347, + "step": 2650 + }, + { + "epoch": 0.8286964676461395, + "grad_norm": 0.2255859375, + "learning_rate": 0.00017967437850997185, + "loss": 1.7372, + "step": 2651 + }, + { + "epoch": 0.8290090653329165, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017965952768777649, + "loss": 1.5994, + "step": 2652 + }, + { + "epoch": 0.8293216630196937, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001796446720564339, + "loss": 1.8905, + "step": 2653 + }, + { + "epoch": 0.8296342607064707, + "grad_norm": 0.3359375, + "learning_rate": 0.00017962981161684098, + "loss": 2.5074, + "step": 2654 + }, + { + "epoch": 0.8299468583932479, + "grad_norm": 0.2421875, + "learning_rate": 0.00017961494636989486, + "loss": 1.9347, + "step": 2655 + }, + { + "epoch": 0.830259456080025, + "grad_norm": 0.24609375, + "learning_rate": 0.00017960007631649298, + "loss": 1.8819, + "step": 2656 + }, + { + "epoch": 0.8305720537668021, + "grad_norm": 0.240234375, + "learning_rate": 0.00017958520145753307, + "loss": 1.6299, + "step": 2657 + }, + { + "epoch": 0.8308846514535793, + "grad_norm": 0.25390625, + "learning_rate": 0.00017957032179391312, + "loss": 1.7028, + "step": 2658 + }, + { + "epoch": 0.8311972491403563, + "grad_norm": 0.23046875, + "learning_rate": 0.00017955543732653143, + "loss": 1.8788, + "step": 2659 + }, + { + "epoch": 0.8315098468271335, + "grad_norm": 0.228515625, + "learning_rate": 0.0001795405480562866, + "loss": 1.7432, + "step": 2660 + }, + { + "epoch": 0.8318224445139106, + "grad_norm": 0.24609375, + "learning_rate": 0.00017952565398407757, + "loss": 1.583, + "step": 2661 + }, + { + "epoch": 0.8321350422006877, + "grad_norm": 0.240234375, + "learning_rate": 0.00017951075511080347, + "loss": 1.7078, + "step": 2662 + }, + { + "epoch": 0.8324476398874648, + "grad_norm": 0.234375, + "learning_rate": 0.0001794958514373637, + "loss": 1.4488, + "step": 2663 + }, + { + "epoch": 0.832760237574242, + "grad_norm": 0.2421875, + "learning_rate": 0.00017948094296465814, + "loss": 1.6082, + "step": 2664 + }, + { + "epoch": 0.833072835261019, + "grad_norm": 0.244140625, + "learning_rate": 0.00017946602969358673, + "loss": 1.6088, + "step": 2665 + }, + { + "epoch": 0.8333854329477962, + "grad_norm": 0.248046875, + "learning_rate": 
0.00017945111162504987, + "loss": 1.7525, + "step": 2666 + }, + { + "epoch": 0.8336980306345733, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017943618875994815, + "loss": 1.8168, + "step": 2667 + }, + { + "epoch": 0.8340106283213504, + "grad_norm": 0.244140625, + "learning_rate": 0.00017942126109918248, + "loss": 1.7631, + "step": 2668 + }, + { + "epoch": 0.8343232260081276, + "grad_norm": 0.234375, + "learning_rate": 0.00017940632864365408, + "loss": 1.665, + "step": 2669 + }, + { + "epoch": 0.8346358236949046, + "grad_norm": 0.2265625, + "learning_rate": 0.00017939139139426443, + "loss": 1.7743, + "step": 2670 + }, + { + "epoch": 0.8349484213816818, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001793764493519153, + "loss": 1.6251, + "step": 2671 + }, + { + "epoch": 0.8352610190684588, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017936150251750876, + "loss": 1.5676, + "step": 2672 + }, + { + "epoch": 0.835573616755236, + "grad_norm": 0.244140625, + "learning_rate": 0.0001793465508919472, + "loss": 1.9198, + "step": 2673 + }, + { + "epoch": 0.8358862144420132, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017933159447613325, + "loss": 1.8999, + "step": 2674 + }, + { + "epoch": 0.8361988121287902, + "grad_norm": 0.232421875, + "learning_rate": 0.00017931663327096985, + "loss": 1.5773, + "step": 2675 + }, + { + "epoch": 0.8365114098155674, + "grad_norm": 0.251953125, + "learning_rate": 0.00017930166727736022, + "loss": 1.5615, + "step": 2676 + }, + { + "epoch": 0.8368240075023445, + "grad_norm": 0.2265625, + "learning_rate": 0.0001792866964962079, + "loss": 1.7466, + "step": 2677 + }, + { + "epoch": 0.8371366051891216, + "grad_norm": 0.236328125, + "learning_rate": 0.00017927172092841665, + "loss": 1.5719, + "step": 2678 + }, + { + "epoch": 0.8374492028758987, + "grad_norm": 0.236328125, + "learning_rate": 0.00017925674057489062, + "loss": 1.8351, + "step": 2679 + }, + { + "epoch": 0.8377618005626758, + "grad_norm": 0.2421875, + "learning_rate": 0.00017924175543653412, + "loss": 1.3423, + "step": 2680 + }, + { + "epoch": 0.838074398249453, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001792267655142519, + "loss": 1.8691, + "step": 2681 + }, + { + "epoch": 0.8383869959362301, + "grad_norm": 0.23046875, + "learning_rate": 0.00017921177080894887, + "loss": 1.5727, + "step": 2682 + }, + { + "epoch": 0.8386995936230072, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001791967713215303, + "loss": 1.5138, + "step": 2683 + }, + { + "epoch": 0.8390121913097843, + "grad_norm": 0.244140625, + "learning_rate": 0.00017918176705290174, + "loss": 1.7783, + "step": 2684 + }, + { + "epoch": 0.8393247889965614, + "grad_norm": 0.23828125, + "learning_rate": 0.00017916675800396897, + "loss": 1.8948, + "step": 2685 + }, + { + "epoch": 0.8396373866833385, + "grad_norm": 0.240234375, + "learning_rate": 0.00017915174417563816, + "loss": 1.6654, + "step": 2686 + }, + { + "epoch": 0.8399499843701157, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017913672556881566, + "loss": 1.8393, + "step": 2687 + }, + { + "epoch": 0.8402625820568927, + "grad_norm": 0.232421875, + "learning_rate": 0.00017912170218440822, + "loss": 1.5724, + "step": 2688 + }, + { + "epoch": 0.8405751797436699, + "grad_norm": 0.236328125, + "learning_rate": 0.0001791066740233228, + "loss": 1.5801, + "step": 2689 + }, + { + "epoch": 0.8408877774304471, + "grad_norm": 0.259765625, + "learning_rate": 0.00017909164108646667, + "loss": 1.6645, + "step": 2690 + }, + { + "epoch": 0.8412003751172241, + "grad_norm": 
0.2255859375, + "learning_rate": 0.00017907660337474735, + "loss": 1.6794, + "step": 2691 + }, + { + "epoch": 0.8415129728040013, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001790615608890727, + "loss": 1.6382, + "step": 2692 + }, + { + "epoch": 0.8418255704907783, + "grad_norm": 0.24609375, + "learning_rate": 0.00017904651363035093, + "loss": 1.6977, + "step": 2693 + }, + { + "epoch": 0.8421381681775555, + "grad_norm": 0.23828125, + "learning_rate": 0.00017903146159949036, + "loss": 1.4432, + "step": 2694 + }, + { + "epoch": 0.8424507658643327, + "grad_norm": 0.22265625, + "learning_rate": 0.00017901640479739975, + "loss": 1.7628, + "step": 2695 + }, + { + "epoch": 0.8427633635511097, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001790013432249881, + "loss": 1.6406, + "step": 2696 + }, + { + "epoch": 0.8430759612378869, + "grad_norm": 0.37890625, + "learning_rate": 0.00017898627688316468, + "loss": 2.2605, + "step": 2697 + }, + { + "epoch": 0.8433885589246639, + "grad_norm": 0.255859375, + "learning_rate": 0.00017897120577283908, + "loss": 1.6559, + "step": 2698 + }, + { + "epoch": 0.8437011566114411, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017895612989492113, + "loss": 1.7878, + "step": 2699 + }, + { + "epoch": 0.8440137542982182, + "grad_norm": 0.25, + "learning_rate": 0.000178941049250321, + "loss": 1.7082, + "step": 2700 + }, + { + "epoch": 0.8443263519849953, + "grad_norm": 0.228515625, + "learning_rate": 0.00017892596383994915, + "loss": 1.6265, + "step": 2701 + }, + { + "epoch": 0.8446389496717724, + "grad_norm": 0.2421875, + "learning_rate": 0.00017891087366471632, + "loss": 1.6036, + "step": 2702 + }, + { + "epoch": 0.8449515473585495, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017889577872553343, + "loss": 1.4701, + "step": 2703 + }, + { + "epoch": 0.8452641450453267, + "grad_norm": 0.236328125, + "learning_rate": 0.00017888067902331186, + "loss": 1.7345, + "step": 2704 + }, + { + "epoch": 0.8455767427321038, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001788655745589632, + "loss": 1.7042, + "step": 2705 + }, + { + "epoch": 0.8458893404188809, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001788504653333993, + "loss": 1.9033, + "step": 2706 + }, + { + "epoch": 0.846201938105658, + "grad_norm": 0.216796875, + "learning_rate": 0.0001788353513475323, + "loss": 1.6525, + "step": 2707 + }, + { + "epoch": 0.8465145357924352, + "grad_norm": 0.251953125, + "learning_rate": 0.0001788202326022747, + "loss": 1.6119, + "step": 2708 + }, + { + "epoch": 0.8468271334792122, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001788051090985392, + "loss": 1.7473, + "step": 2709 + }, + { + "epoch": 0.8471397311659894, + "grad_norm": 0.244140625, + "learning_rate": 0.00017878998083723885, + "loss": 1.8992, + "step": 2710 + }, + { + "epoch": 0.8474523288527664, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017877484781928698, + "loss": 1.6285, + "step": 2711 + }, + { + "epoch": 0.8477649265395436, + "grad_norm": 0.24609375, + "learning_rate": 0.00017875971004559712, + "loss": 1.671, + "step": 2712 + }, + { + "epoch": 0.8480775242263208, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001787445675170832, + "loss": 1.639, + "step": 2713 + }, + { + "epoch": 0.8483901219130978, + "grad_norm": 0.263671875, + "learning_rate": 0.00017872942023465944, + "loss": 2.2887, + "step": 2714 + }, + { + "epoch": 0.848702719599875, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017871426819924025, + "loss": 1.6424, + "step": 2715 + }, + { + "epoch": 
0.849015317286652, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017869911141174034, + "loss": 1.6615, + "step": 2716 + }, + { + "epoch": 0.8493279149734292, + "grad_norm": 0.251953125, + "learning_rate": 0.00017868394987307482, + "loss": 1.8865, + "step": 2717 + }, + { + "epoch": 0.8496405126602063, + "grad_norm": 0.251953125, + "learning_rate": 0.00017866878358415895, + "loss": 1.4584, + "step": 2718 + }, + { + "epoch": 0.8499531103469834, + "grad_norm": 0.236328125, + "learning_rate": 0.0001786536125459084, + "loss": 1.7852, + "step": 2719 + }, + { + "epoch": 0.8502657080337606, + "grad_norm": 0.2392578125, + "learning_rate": 0.000178638436759239, + "loss": 1.5773, + "step": 2720 + }, + { + "epoch": 0.8505783057205377, + "grad_norm": 0.255859375, + "learning_rate": 0.00017862325622506698, + "loss": 1.5571, + "step": 2721 + }, + { + "epoch": 0.8508909034073148, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017860807094430877, + "loss": 1.6325, + "step": 2722 + }, + { + "epoch": 0.8512035010940919, + "grad_norm": 0.2421875, + "learning_rate": 0.0001785928809178812, + "loss": 2.1872, + "step": 2723 + }, + { + "epoch": 0.851516098780869, + "grad_norm": 0.2421875, + "learning_rate": 0.0001785776861467012, + "loss": 1.7218, + "step": 2724 + }, + { + "epoch": 0.8518286964676461, + "grad_norm": 0.232421875, + "learning_rate": 0.00017856248663168618, + "loss": 1.8967, + "step": 2725 + }, + { + "epoch": 0.8521412941544233, + "grad_norm": 0.234375, + "learning_rate": 0.00017854728237375373, + "loss": 1.412, + "step": 2726 + }, + { + "epoch": 0.8524538918412004, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017853207337382174, + "loss": 1.5824, + "step": 2727 + }, + { + "epoch": 0.8527664895279775, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001785168596328084, + "loss": 1.6068, + "step": 2728 + }, + { + "epoch": 0.8530790872147546, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001785016411516322, + "loss": 1.5164, + "step": 2729 + }, + { + "epoch": 0.8533916849015317, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017848641793121188, + "loss": 1.8491, + "step": 2730 + }, + { + "epoch": 0.8537042825883089, + "grad_norm": 0.24609375, + "learning_rate": 0.0001784711899724665, + "loss": 1.6247, + "step": 2731 + }, + { + "epoch": 0.8540168802750859, + "grad_norm": 0.25390625, + "learning_rate": 0.0001784559572763154, + "loss": 1.4966, + "step": 2732 + }, + { + "epoch": 0.8543294779618631, + "grad_norm": 0.2275390625, + "learning_rate": 0.00017844071984367816, + "loss": 1.5311, + "step": 2733 + }, + { + "epoch": 0.8546420756486403, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001784254776754747, + "loss": 1.521, + "step": 2734 + }, + { + "epoch": 0.8549546733354173, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017841023077262523, + "loss": 1.7637, + "step": 2735 + }, + { + "epoch": 0.8552672710221945, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001783949791360502, + "loss": 1.3663, + "step": 2736 + }, + { + "epoch": 0.8555798687089715, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001783797227666704, + "loss": 1.6854, + "step": 2737 + }, + { + "epoch": 0.8558924663957487, + "grad_norm": 0.232421875, + "learning_rate": 0.00017836446166540683, + "loss": 1.7461, + "step": 2738 + }, + { + "epoch": 0.8562050640825258, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017834919583318087, + "loss": 1.5579, + "step": 2739 + }, + { + "epoch": 0.8565176617693029, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017833392527091412, + "loss": 1.8503, 
+ "step": 2740 + }, + { + "epoch": 0.85683025945608, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017831864997952846, + "loss": 1.7036, + "step": 2741 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017830336995994608, + "loss": 1.546, + "step": 2742 + }, + { + "epoch": 0.8574554548296343, + "grad_norm": 0.2421875, + "learning_rate": 0.00017828808521308949, + "loss": 1.7367, + "step": 2743 + }, + { + "epoch": 0.8577680525164114, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017827279573988145, + "loss": 1.6342, + "step": 2744 + }, + { + "epoch": 0.8580806502031885, + "grad_norm": 0.2421875, + "learning_rate": 0.00017825750154124497, + "loss": 1.4992, + "step": 2745 + }, + { + "epoch": 0.8583932478899656, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017824220261810337, + "loss": 1.6274, + "step": 2746 + }, + { + "epoch": 0.8587058455767427, + "grad_norm": 0.25, + "learning_rate": 0.00017822689897138035, + "loss": 1.4625, + "step": 2747 + }, + { + "epoch": 0.8590184432635198, + "grad_norm": 0.232421875, + "learning_rate": 0.00017821159060199974, + "loss": 1.4388, + "step": 2748 + }, + { + "epoch": 0.859331040950297, + "grad_norm": 0.23828125, + "learning_rate": 0.00017819627751088573, + "loss": 1.4505, + "step": 2749 + }, + { + "epoch": 0.859643638637074, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001781809596989628, + "loss": 1.4593, + "step": 2750 + }, + { + "epoch": 0.8599562363238512, + "grad_norm": 0.224609375, + "learning_rate": 0.0001781656371671557, + "loss": 1.5498, + "step": 2751 + }, + { + "epoch": 0.8602688340106284, + "grad_norm": 0.30859375, + "learning_rate": 0.00017815030991638947, + "loss": 2.1876, + "step": 2752 + }, + { + "epoch": 0.8605814316974054, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017813497794758946, + "loss": 1.4955, + "step": 2753 + }, + { + "epoch": 0.8608940293841826, + "grad_norm": 0.236328125, + "learning_rate": 0.00017811964126168123, + "loss": 1.6525, + "step": 2754 + }, + { + "epoch": 0.8612066270709596, + "grad_norm": 0.24609375, + "learning_rate": 0.00017810429985959077, + "loss": 1.7273, + "step": 2755 + }, + { + "epoch": 0.8615192247577368, + "grad_norm": 0.26171875, + "learning_rate": 0.00017808895374224414, + "loss": 1.6337, + "step": 2756 + }, + { + "epoch": 0.861831822444514, + "grad_norm": 0.232421875, + "learning_rate": 0.0001780736029105679, + "loss": 1.572, + "step": 2757 + }, + { + "epoch": 0.862144420131291, + "grad_norm": 0.2421875, + "learning_rate": 0.00017805824736548872, + "loss": 1.7677, + "step": 2758 + }, + { + "epoch": 0.8624570178180682, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017804288710793374, + "loss": 1.4813, + "step": 2759 + }, + { + "epoch": 0.8627696155048452, + "grad_norm": 0.255859375, + "learning_rate": 0.00017802752213883017, + "loss": 1.863, + "step": 2760 + }, + { + "epoch": 0.8630822131916224, + "grad_norm": 0.232421875, + "learning_rate": 0.00017801215245910569, + "loss": 1.7106, + "step": 2761 + }, + { + "epoch": 0.8633948108783995, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017799677806968811, + "loss": 1.5748, + "step": 2762 + }, + { + "epoch": 0.8637074085651766, + "grad_norm": 0.263671875, + "learning_rate": 0.00017798139897150564, + "loss": 1.7248, + "step": 2763 + }, + { + "epoch": 0.8640200062519537, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017796601516548676, + "loss": 1.7132, + "step": 2764 + }, + { + "epoch": 0.8643326039387309, + "grad_norm": 0.2373046875, + "learning_rate": 
0.0001779506266525602, + "loss": 1.742, + "step": 2765 + }, + { + "epoch": 0.864645201625508, + "grad_norm": 0.2431640625, + "learning_rate": 0.000177935233433655, + "loss": 1.8706, + "step": 2766 + }, + { + "epoch": 0.8649577993122851, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001779198355097004, + "loss": 1.5686, + "step": 2767 + }, + { + "epoch": 0.8652703969990622, + "grad_norm": 0.234375, + "learning_rate": 0.00017790443288162605, + "loss": 1.7863, + "step": 2768 + }, + { + "epoch": 0.8655829946858393, + "grad_norm": 0.248046875, + "learning_rate": 0.00017788902555036182, + "loss": 1.6466, + "step": 2769 + }, + { + "epoch": 0.8658955923726165, + "grad_norm": 0.26171875, + "learning_rate": 0.00017787361351683786, + "loss": 1.7133, + "step": 2770 + }, + { + "epoch": 0.8662081900593935, + "grad_norm": 0.2314453125, + "learning_rate": 0.00017785819678198462, + "loss": 1.7669, + "step": 2771 + }, + { + "epoch": 0.8665207877461707, + "grad_norm": 0.23046875, + "learning_rate": 0.0001778427753467328, + "loss": 1.7054, + "step": 2772 + }, + { + "epoch": 0.8668333854329477, + "grad_norm": 0.240234375, + "learning_rate": 0.00017782734921201348, + "loss": 1.5878, + "step": 2773 + }, + { + "epoch": 0.8671459831197249, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017781191837875788, + "loss": 1.5847, + "step": 2774 + }, + { + "epoch": 0.8674585808065021, + "grad_norm": 0.240234375, + "learning_rate": 0.0001777964828478976, + "loss": 1.556, + "step": 2775 + }, + { + "epoch": 0.8677711784932791, + "grad_norm": 0.236328125, + "learning_rate": 0.00017778104262036455, + "loss": 1.481, + "step": 2776 + }, + { + "epoch": 0.8680837761800563, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001777655976970908, + "loss": 1.5842, + "step": 2777 + }, + { + "epoch": 0.8683963738668334, + "grad_norm": 0.251953125, + "learning_rate": 0.00017775014807900884, + "loss": 1.6188, + "step": 2778 + }, + { + "epoch": 0.8687089715536105, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017773469376705138, + "loss": 1.7405, + "step": 2779 + }, + { + "epoch": 0.8690215692403876, + "grad_norm": 0.234375, + "learning_rate": 0.00017771923476215138, + "loss": 2.009, + "step": 2780 + }, + { + "epoch": 0.8693341669271647, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017770377106524215, + "loss": 1.5022, + "step": 2781 + }, + { + "epoch": 0.8696467646139419, + "grad_norm": 0.298828125, + "learning_rate": 0.0001776883026772572, + "loss": 2.3243, + "step": 2782 + }, + { + "epoch": 0.869959362300719, + "grad_norm": 0.25, + "learning_rate": 0.00017767282959913047, + "loss": 1.5778, + "step": 2783 + }, + { + "epoch": 0.8702719599874961, + "grad_norm": 0.244140625, + "learning_rate": 0.00017765735183179602, + "loss": 1.648, + "step": 2784 + }, + { + "epoch": 0.8705845576742732, + "grad_norm": 0.23828125, + "learning_rate": 0.00017764186937618828, + "loss": 1.9461, + "step": 2785 + }, + { + "epoch": 0.8708971553610503, + "grad_norm": 0.23828125, + "learning_rate": 0.00017762638223324192, + "loss": 1.6331, + "step": 2786 + }, + { + "epoch": 0.8712097530478274, + "grad_norm": 0.23046875, + "learning_rate": 0.00017761089040389198, + "loss": 1.5506, + "step": 2787 + }, + { + "epoch": 0.8715223507346046, + "grad_norm": 0.236328125, + "learning_rate": 0.00017759539388907366, + "loss": 1.4817, + "step": 2788 + }, + { + "epoch": 0.8718349484213817, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017757989268972257, + "loss": 1.4606, + "step": 2789 + }, + { + "epoch": 0.8721475461081588, + "grad_norm": 
0.2373046875, + "learning_rate": 0.00017756438680677445, + "loss": 1.4484, + "step": 2790 + }, + { + "epoch": 0.872460143794936, + "grad_norm": 0.234375, + "learning_rate": 0.00017754887624116548, + "loss": 1.5865, + "step": 2791 + }, + { + "epoch": 0.872772741481713, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017753336099383203, + "loss": 1.514, + "step": 2792 + }, + { + "epoch": 0.8730853391684902, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017751784106571079, + "loss": 1.3963, + "step": 2793 + }, + { + "epoch": 0.8733979368552672, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017750231645773869, + "loss": 1.8982, + "step": 2794 + }, + { + "epoch": 0.8737105345420444, + "grad_norm": 0.232421875, + "learning_rate": 0.00017748678717085297, + "loss": 1.7107, + "step": 2795 + }, + { + "epoch": 0.8740231322288216, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017747125320599118, + "loss": 1.5219, + "step": 2796 + }, + { + "epoch": 0.8743357299155986, + "grad_norm": 0.236328125, + "learning_rate": 0.0001774557145640911, + "loss": 1.6148, + "step": 2797 + }, + { + "epoch": 0.8746483276023758, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017744017124609083, + "loss": 1.4968, + "step": 2798 + }, + { + "epoch": 0.8749609252891528, + "grad_norm": 0.24609375, + "learning_rate": 0.00017742462325292873, + "loss": 1.6438, + "step": 2799 + }, + { + "epoch": 0.87527352297593, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001774090705855435, + "loss": 1.8157, + "step": 2800 + }, + { + "epoch": 0.8755861206627071, + "grad_norm": 0.2314453125, + "learning_rate": 0.000177393513244874, + "loss": 1.8969, + "step": 2801 + }, + { + "epoch": 0.8758987183494842, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001773779512318595, + "loss": 1.7561, + "step": 2802 + }, + { + "epoch": 0.8762113160362613, + "grad_norm": 0.2421875, + "learning_rate": 0.00017736238454743946, + "loss": 1.8387, + "step": 2803 + }, + { + "epoch": 0.8765239137230384, + "grad_norm": 0.2421875, + "learning_rate": 0.0001773468131925537, + "loss": 1.8426, + "step": 2804 + }, + { + "epoch": 0.8768365114098156, + "grad_norm": 0.25, + "learning_rate": 0.00017733123716814225, + "loss": 1.5613, + "step": 2805 + }, + { + "epoch": 0.8771491090965927, + "grad_norm": 0.255859375, + "learning_rate": 0.0001773156564751455, + "loss": 1.9907, + "step": 2806 + }, + { + "epoch": 0.8774617067833698, + "grad_norm": 0.232421875, + "learning_rate": 0.00017730007111450402, + "loss": 1.3814, + "step": 2807 + }, + { + "epoch": 0.8777743044701469, + "grad_norm": 0.23046875, + "learning_rate": 0.00017728448108715874, + "loss": 1.459, + "step": 2808 + }, + { + "epoch": 0.8780869021569241, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017726888639405086, + "loss": 1.6541, + "step": 2809 + }, + { + "epoch": 0.8783994998437011, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017725328703612183, + "loss": 1.6136, + "step": 2810 + }, + { + "epoch": 0.8787120975304783, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017723768301431344, + "loss": 1.9023, + "step": 2811 + }, + { + "epoch": 0.8790246952172553, + "grad_norm": 0.3203125, + "learning_rate": 0.00017722207432956767, + "loss": 2.4062, + "step": 2812 + }, + { + "epoch": 0.8793372929040325, + "grad_norm": 0.232421875, + "learning_rate": 0.00017720646098282687, + "loss": 1.6481, + "step": 2813 + }, + { + "epoch": 0.8796498905908097, + "grad_norm": 0.232421875, + "learning_rate": 0.00017719084297503367, + "loss": 1.7955, + "step": 2814 + }, + { + "epoch": 
0.8799624882775867, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001771752203071309, + "loss": 1.7442, + "step": 2815 + }, + { + "epoch": 0.8802750859643639, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001771595929800617, + "loss": 1.9734, + "step": 2816 + }, + { + "epoch": 0.8805876836511409, + "grad_norm": 0.244140625, + "learning_rate": 0.0001771439609947696, + "loss": 1.651, + "step": 2817 + }, + { + "epoch": 0.8809002813379181, + "grad_norm": 0.240234375, + "learning_rate": 0.00017712832435219823, + "loss": 1.6914, + "step": 2818 + }, + { + "epoch": 0.8812128790246953, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017711268305329166, + "loss": 1.9028, + "step": 2819 + }, + { + "epoch": 0.8815254767114723, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017709703709899413, + "loss": 1.7345, + "step": 2820 + }, + { + "epoch": 0.8818380743982495, + "grad_norm": 0.2255859375, + "learning_rate": 0.00017708138649025023, + "loss": 1.8512, + "step": 2821 + }, + { + "epoch": 0.8821506720850266, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001770657312280048, + "loss": 1.6781, + "step": 2822 + }, + { + "epoch": 0.8824632697718037, + "grad_norm": 0.23828125, + "learning_rate": 0.00017705007131320298, + "loss": 1.5084, + "step": 2823 + }, + { + "epoch": 0.8827758674585808, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017703440674679015, + "loss": 1.5801, + "step": 2824 + }, + { + "epoch": 0.8830884651453579, + "grad_norm": 0.2265625, + "learning_rate": 0.00017701873752971206, + "loss": 1.7738, + "step": 2825 + }, + { + "epoch": 0.883401062832135, + "grad_norm": 0.232421875, + "learning_rate": 0.00017700306366291458, + "loss": 1.7093, + "step": 2826 + }, + { + "epoch": 0.8837136605189122, + "grad_norm": 0.23046875, + "learning_rate": 0.00017698738514734406, + "loss": 1.7994, + "step": 2827 + }, + { + "epoch": 0.8840262582056893, + "grad_norm": 0.2216796875, + "learning_rate": 0.00017697170198394696, + "loss": 1.7524, + "step": 2828 + }, + { + "epoch": 0.8843388558924664, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001769560141736702, + "loss": 1.4667, + "step": 2829 + }, + { + "epoch": 0.8846514535792435, + "grad_norm": 0.23828125, + "learning_rate": 0.00017694032171746072, + "loss": 1.4843, + "step": 2830 + }, + { + "epoch": 0.8849640512660206, + "grad_norm": 0.240234375, + "learning_rate": 0.000176924624616266, + "loss": 1.4988, + "step": 2831 + }, + { + "epoch": 0.8852766489527978, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017690892287103367, + "loss": 1.5816, + "step": 2832 + }, + { + "epoch": 0.8855892466395748, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017689321648271166, + "loss": 1.7245, + "step": 2833 + }, + { + "epoch": 0.885901844326352, + "grad_norm": 0.2216796875, + "learning_rate": 0.00017687750545224815, + "loss": 1.7804, + "step": 2834 + }, + { + "epoch": 0.8862144420131292, + "grad_norm": 0.251953125, + "learning_rate": 0.0001768617897805917, + "loss": 1.5097, + "step": 2835 + }, + { + "epoch": 0.8865270396999062, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017684606946869106, + "loss": 1.5496, + "step": 2836 + }, + { + "epoch": 0.8868396373866834, + "grad_norm": 0.236328125, + "learning_rate": 0.00017683034451749526, + "loss": 1.829, + "step": 2837 + }, + { + "epoch": 0.8871522350734604, + "grad_norm": 0.251953125, + "learning_rate": 0.0001768146149279537, + "loss": 1.4844, + "step": 2838 + }, + { + "epoch": 0.8874648327602376, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017679888070101592, + "loss": 
1.7066, + "step": 2839 + }, + { + "epoch": 0.8877774304470147, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017678314183763183, + "loss": 1.5307, + "step": 2840 + }, + { + "epoch": 0.8880900281337918, + "grad_norm": 0.240234375, + "learning_rate": 0.00017676739833875164, + "loss": 1.4304, + "step": 2841 + }, + { + "epoch": 0.888402625820569, + "grad_norm": 0.22265625, + "learning_rate": 0.00017675165020532578, + "loss": 1.6068, + "step": 2842 + }, + { + "epoch": 0.888715223507346, + "grad_norm": 0.244140625, + "learning_rate": 0.000176735897438305, + "loss": 1.4709, + "step": 2843 + }, + { + "epoch": 0.8890278211941232, + "grad_norm": 0.23828125, + "learning_rate": 0.00017672014003864033, + "loss": 1.6562, + "step": 2844 + }, + { + "epoch": 0.8893404188809003, + "grad_norm": 0.2265625, + "learning_rate": 0.000176704378007283, + "loss": 1.8352, + "step": 2845 + }, + { + "epoch": 0.8896530165676774, + "grad_norm": 0.259765625, + "learning_rate": 0.0001766886113451846, + "loss": 1.8639, + "step": 2846 + }, + { + "epoch": 0.8899656142544545, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017667284005329708, + "loss": 1.6163, + "step": 2847 + }, + { + "epoch": 0.8902782119412317, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017665706413257245, + "loss": 1.7933, + "step": 2848 + }, + { + "epoch": 0.8905908096280087, + "grad_norm": 0.232421875, + "learning_rate": 0.0001766412835839632, + "loss": 1.6013, + "step": 2849 + }, + { + "epoch": 0.8909034073147859, + "grad_norm": 0.248046875, + "learning_rate": 0.000176625498408422, + "loss": 1.6694, + "step": 2850 + }, + { + "epoch": 0.891216005001563, + "grad_norm": 0.25390625, + "learning_rate": 0.0001766097086069018, + "loss": 1.6816, + "step": 2851 + }, + { + "epoch": 0.8915286026883401, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017659391418035588, + "loss": 1.7289, + "step": 2852 + }, + { + "epoch": 0.8918412003751173, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001765781151297377, + "loss": 1.4146, + "step": 2853 + }, + { + "epoch": 0.8921537980618943, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001765623114560012, + "loss": 1.6338, + "step": 2854 + }, + { + "epoch": 0.8924663957486715, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017654650316010036, + "loss": 1.623, + "step": 2855 + }, + { + "epoch": 0.8927789934354485, + "grad_norm": 0.24609375, + "learning_rate": 0.00017653069024298957, + "loss": 1.6547, + "step": 2856 + }, + { + "epoch": 0.8930915911222257, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001765148727056235, + "loss": 1.7697, + "step": 2857 + }, + { + "epoch": 0.8934041888090029, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017649905054895705, + "loss": 1.7488, + "step": 2858 + }, + { + "epoch": 0.8937167864957799, + "grad_norm": 0.3046875, + "learning_rate": 0.00017648322377394546, + "loss": 2.1237, + "step": 2859 + }, + { + "epoch": 0.8940293841825571, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017646739238154417, + "loss": 1.6839, + "step": 2860 + }, + { + "epoch": 0.8943419818693341, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017645155637270897, + "loss": 1.6423, + "step": 2861 + }, + { + "epoch": 0.8946545795561113, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017643571574839587, + "loss": 1.7184, + "step": 2862 + }, + { + "epoch": 0.8949671772428884, + "grad_norm": 0.251953125, + "learning_rate": 0.00017641987050956122, + "loss": 1.8355, + "step": 2863 + }, + { + "epoch": 0.8952797749296655, + "grad_norm": 0.2333984375, + "learning_rate": 
0.0001764040206571616, + "loss": 1.6686, + "step": 2864 + }, + { + "epoch": 0.8955923726164426, + "grad_norm": 0.23046875, + "learning_rate": 0.00017638816619215388, + "loss": 1.7545, + "step": 2865 + }, + { + "epoch": 0.8959049703032198, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017637230711549525, + "loss": 1.7738, + "step": 2866 + }, + { + "epoch": 0.8962175679899969, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001763564434281431, + "loss": 1.7099, + "step": 2867 + }, + { + "epoch": 0.896530165676774, + "grad_norm": 0.23046875, + "learning_rate": 0.00017634057513105515, + "loss": 1.6731, + "step": 2868 + }, + { + "epoch": 0.8968427633635511, + "grad_norm": 0.244140625, + "learning_rate": 0.0001763247022251894, + "loss": 1.4654, + "step": 2869 + }, + { + "epoch": 0.8971553610503282, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017630882471150413, + "loss": 1.7359, + "step": 2870 + }, + { + "epoch": 0.8974679587371054, + "grad_norm": 0.2421875, + "learning_rate": 0.00017629294259095785, + "loss": 1.5702, + "step": 2871 + }, + { + "epoch": 0.8977805564238824, + "grad_norm": 0.26171875, + "learning_rate": 0.00017627705586450944, + "loss": 2.429, + "step": 2872 + }, + { + "epoch": 0.8980931541106596, + "grad_norm": 0.25, + "learning_rate": 0.00017626116453311794, + "loss": 1.8714, + "step": 2873 + }, + { + "epoch": 0.8984057517974366, + "grad_norm": 0.2421875, + "learning_rate": 0.00017624526859774274, + "loss": 1.592, + "step": 2874 + }, + { + "epoch": 0.8987183494842138, + "grad_norm": 0.234375, + "learning_rate": 0.00017622936805934355, + "loss": 1.9351, + "step": 2875 + }, + { + "epoch": 0.899030947170991, + "grad_norm": 0.244140625, + "learning_rate": 0.00017621346291888025, + "loss": 1.5676, + "step": 2876 + }, + { + "epoch": 0.899343544857768, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001761975531773131, + "loss": 2.0676, + "step": 2877 + }, + { + "epoch": 0.8996561425445452, + "grad_norm": 0.234375, + "learning_rate": 0.00017618163883560255, + "loss": 1.8676, + "step": 2878 + }, + { + "epoch": 0.8999687402313223, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017616571989470937, + "loss": 1.6823, + "step": 2879 + }, + { + "epoch": 0.9002813379180994, + "grad_norm": 0.24609375, + "learning_rate": 0.00017614979635559462, + "loss": 1.6829, + "step": 2880 + }, + { + "epoch": 0.9005939356048765, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017613386821921964, + "loss": 1.3811, + "step": 2881 + }, + { + "epoch": 0.9009065332916536, + "grad_norm": 0.259765625, + "learning_rate": 0.00017611793548654602, + "loss": 1.3734, + "step": 2882 + }, + { + "epoch": 0.9012191309784308, + "grad_norm": 0.220703125, + "learning_rate": 0.00017610199815853563, + "loss": 1.8464, + "step": 2883 + }, + { + "epoch": 0.9015317286652079, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017608605623615063, + "loss": 1.4275, + "step": 2884 + }, + { + "epoch": 0.901844326351985, + "grad_norm": 0.232421875, + "learning_rate": 0.00017607010972035348, + "loss": 1.5875, + "step": 2885 + }, + { + "epoch": 0.9021569240387621, + "grad_norm": 0.236328125, + "learning_rate": 0.00017605415861210685, + "loss": 1.8575, + "step": 2886 + }, + { + "epoch": 0.9024695217255392, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017603820291237375, + "loss": 1.8156, + "step": 2887 + }, + { + "epoch": 0.9027821194123163, + "grad_norm": 0.240234375, + "learning_rate": 0.00017602224262211743, + "loss": 1.4908, + "step": 2888 + }, + { + "epoch": 0.9030947170990935, + "grad_norm": 
0.244140625, + "learning_rate": 0.00017600627774230144, + "loss": 1.7584, + "step": 2889 + }, + { + "epoch": 0.9034073147858706, + "grad_norm": 0.25, + "learning_rate": 0.00017599030827388965, + "loss": 1.7706, + "step": 2890 + }, + { + "epoch": 0.9037199124726477, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001759743342178461, + "loss": 1.6771, + "step": 2891 + }, + { + "epoch": 0.9040325101594249, + "grad_norm": 0.251953125, + "learning_rate": 0.00017595835557513516, + "loss": 1.838, + "step": 2892 + }, + { + "epoch": 0.9043451078462019, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017594237234672152, + "loss": 1.7833, + "step": 2893 + }, + { + "epoch": 0.9046577055329791, + "grad_norm": 0.244140625, + "learning_rate": 0.00017592638453357005, + "loss": 1.8564, + "step": 2894 + }, + { + "epoch": 0.9049703032197561, + "grad_norm": 0.236328125, + "learning_rate": 0.000175910392136646, + "loss": 1.4054, + "step": 2895 + }, + { + "epoch": 0.9052829009065333, + "grad_norm": 0.234375, + "learning_rate": 0.00017589439515691487, + "loss": 1.7344, + "step": 2896 + }, + { + "epoch": 0.9055954985933105, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001758783935953424, + "loss": 1.6391, + "step": 2897 + }, + { + "epoch": 0.9059080962800875, + "grad_norm": 0.236328125, + "learning_rate": 0.00017586238745289457, + "loss": 1.6244, + "step": 2898 + }, + { + "epoch": 0.9062206939668647, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017584637673053778, + "loss": 1.6056, + "step": 2899 + }, + { + "epoch": 0.9065332916536417, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017583036142923856, + "loss": 1.7858, + "step": 2900 + }, + { + "epoch": 0.9068458893404189, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001758143415499638, + "loss": 1.6028, + "step": 2901 + }, + { + "epoch": 0.907158487027196, + "grad_norm": 0.23828125, + "learning_rate": 0.0001757983170936806, + "loss": 1.6918, + "step": 2902 + }, + { + "epoch": 0.9074710847139731, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017578228806135643, + "loss": 1.9901, + "step": 2903 + }, + { + "epoch": 0.9077836824007502, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017576625445395893, + "loss": 1.5383, + "step": 2904 + }, + { + "epoch": 0.9080962800875274, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017575021627245612, + "loss": 1.5068, + "step": 2905 + }, + { + "epoch": 0.9084088777743045, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017573417351781625, + "loss": 1.8062, + "step": 2906 + }, + { + "epoch": 0.9087214754610816, + "grad_norm": 0.2275390625, + "learning_rate": 0.00017571812619100778, + "loss": 1.4791, + "step": 2907 + }, + { + "epoch": 0.9090340731478587, + "grad_norm": 0.255859375, + "learning_rate": 0.00017570207429299956, + "loss": 1.7496, + "step": 2908 + }, + { + "epoch": 0.9093466708346358, + "grad_norm": 0.244140625, + "learning_rate": 0.00017568601782476064, + "loss": 1.5202, + "step": 2909 + }, + { + "epoch": 0.909659268521413, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017566995678726038, + "loss": 1.6579, + "step": 2910 + }, + { + "epoch": 0.90997186620819, + "grad_norm": 0.251953125, + "learning_rate": 0.0001756538911814684, + "loss": 1.606, + "step": 2911 + }, + { + "epoch": 0.9102844638949672, + "grad_norm": 0.244140625, + "learning_rate": 0.0001756378210083546, + "loss": 1.6417, + "step": 2912 + }, + { + "epoch": 0.9105970615817442, + "grad_norm": 0.232421875, + "learning_rate": 0.00017562174626888918, + "loss": 1.6654, + "step": 2913 + }, + { + "epoch": 
0.9109096592685214, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017560566696404254, + "loss": 1.676, + "step": 2914 + }, + { + "epoch": 0.9112222569552986, + "grad_norm": 0.240234375, + "learning_rate": 0.00017558958309478543, + "loss": 1.5845, + "step": 2915 + }, + { + "epoch": 0.9115348546420756, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001755734946620889, + "loss": 1.5907, + "step": 2916 + }, + { + "epoch": 0.9118474523288528, + "grad_norm": 0.23828125, + "learning_rate": 0.00017555740166692418, + "loss": 1.8526, + "step": 2917 + }, + { + "epoch": 0.9121600500156298, + "grad_norm": 0.255859375, + "learning_rate": 0.00017554130411026283, + "loss": 1.4743, + "step": 2918 + }, + { + "epoch": 0.912472647702407, + "grad_norm": 0.2421875, + "learning_rate": 0.0001755252019930767, + "loss": 1.4929, + "step": 2919 + }, + { + "epoch": 0.9127852453891842, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001755090953163379, + "loss": 1.4583, + "step": 2920 + }, + { + "epoch": 0.9130978430759612, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017549298408101876, + "loss": 1.7967, + "step": 2921 + }, + { + "epoch": 0.9134104407627384, + "grad_norm": 0.244140625, + "learning_rate": 0.00017547686828809196, + "loss": 1.9172, + "step": 2922 + }, + { + "epoch": 0.9137230384495155, + "grad_norm": 0.244140625, + "learning_rate": 0.00017546074793853048, + "loss": 1.5975, + "step": 2923 + }, + { + "epoch": 0.9140356361362926, + "grad_norm": 0.2255859375, + "learning_rate": 0.00017544462303330748, + "loss": 1.8838, + "step": 2924 + }, + { + "epoch": 0.9143482338230697, + "grad_norm": 0.2421875, + "learning_rate": 0.00017542849357339644, + "loss": 1.8619, + "step": 2925 + }, + { + "epoch": 0.9146608315098468, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017541235955977112, + "loss": 1.6366, + "step": 2926 + }, + { + "epoch": 0.9149734291966239, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017539622099340554, + "loss": 1.5817, + "step": 2927 + }, + { + "epoch": 0.9152860268834011, + "grad_norm": 0.248046875, + "learning_rate": 0.000175380077875274, + "loss": 1.5323, + "step": 2928 + }, + { + "epoch": 0.9155986245701782, + "grad_norm": 0.240234375, + "learning_rate": 0.00017536393020635118, + "loss": 1.762, + "step": 2929 + }, + { + "epoch": 0.9159112222569553, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001753477779876118, + "loss": 1.5217, + "step": 2930 + }, + { + "epoch": 0.9162238199437324, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017533162122003107, + "loss": 1.6377, + "step": 2931 + }, + { + "epoch": 0.9165364176305095, + "grad_norm": 0.25, + "learning_rate": 0.00017531545990458436, + "loss": 1.5614, + "step": 2932 + }, + { + "epoch": 0.9168490153172867, + "grad_norm": 0.25390625, + "learning_rate": 0.00017529929404224733, + "loss": 1.9785, + "step": 2933 + }, + { + "epoch": 0.9171616130040637, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017528312363399598, + "loss": 1.6278, + "step": 2934 + }, + { + "epoch": 0.9174742106908409, + "grad_norm": 0.2421875, + "learning_rate": 0.00017526694868080656, + "loss": 1.62, + "step": 2935 + }, + { + "epoch": 0.9177868083776181, + "grad_norm": 0.244140625, + "learning_rate": 0.0001752507691836555, + "loss": 1.66, + "step": 2936 + }, + { + "epoch": 0.9180994060643951, + "grad_norm": 0.234375, + "learning_rate": 0.00017523458514351963, + "loss": 1.711, + "step": 2937 + }, + { + "epoch": 0.9184120037511723, + "grad_norm": 0.236328125, + "learning_rate": 0.00017521839656137598, + "loss": 1.606, + "step": 
2938 + }, + { + "epoch": 0.9187246014379493, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017520220343820184, + "loss": 1.8548, + "step": 2939 + }, + { + "epoch": 0.9190371991247265, + "grad_norm": 0.26171875, + "learning_rate": 0.00017518600577497487, + "loss": 1.6217, + "step": 2940 + }, + { + "epoch": 0.9193497968115036, + "grad_norm": 0.326171875, + "learning_rate": 0.00017516980357267295, + "loss": 2.4887, + "step": 2941 + }, + { + "epoch": 0.9196623944982807, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017515359683227416, + "loss": 1.7841, + "step": 2942 + }, + { + "epoch": 0.9199749921850578, + "grad_norm": 0.2314453125, + "learning_rate": 0.00017513738555475697, + "loss": 1.7065, + "step": 2943 + }, + { + "epoch": 0.9202875898718349, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001751211697411001, + "loss": 1.7469, + "step": 2944 + }, + { + "epoch": 0.9206001875586121, + "grad_norm": 0.228515625, + "learning_rate": 0.00017510494939228246, + "loss": 1.5839, + "step": 2945 + }, + { + "epoch": 0.9209127852453892, + "grad_norm": 0.24609375, + "learning_rate": 0.0001750887245092833, + "loss": 1.7413, + "step": 2946 + }, + { + "epoch": 0.9212253829321663, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017507249509308217, + "loss": 1.433, + "step": 2947 + }, + { + "epoch": 0.9215379806189434, + "grad_norm": 0.244140625, + "learning_rate": 0.00017505626114465886, + "loss": 1.5907, + "step": 2948 + }, + { + "epoch": 0.9218505783057206, + "grad_norm": 0.25, + "learning_rate": 0.0001750400226649934, + "loss": 1.6737, + "step": 2949 + }, + { + "epoch": 0.9221631759924976, + "grad_norm": 0.25390625, + "learning_rate": 0.00017502377965506613, + "loss": 1.5084, + "step": 2950 + }, + { + "epoch": 0.9224757736792748, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017500753211585772, + "loss": 1.4999, + "step": 2951 + }, + { + "epoch": 0.9227883713660519, + "grad_norm": 0.2451171875, + "learning_rate": 0.000174991280048349, + "loss": 1.6843, + "step": 2952 + }, + { + "epoch": 0.923100969052829, + "grad_norm": 0.2421875, + "learning_rate": 0.00017497502345352112, + "loss": 1.6222, + "step": 2953 + }, + { + "epoch": 0.9234135667396062, + "grad_norm": 0.23828125, + "learning_rate": 0.00017495876233235554, + "loss": 1.5935, + "step": 2954 + }, + { + "epoch": 0.9237261644263832, + "grad_norm": 0.25390625, + "learning_rate": 0.000174942496685834, + "loss": 1.9363, + "step": 2955 + }, + { + "epoch": 0.9240387621131604, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017492622651493837, + "loss": 1.8212, + "step": 2956 + }, + { + "epoch": 0.9243513597999374, + "grad_norm": 0.255859375, + "learning_rate": 0.000174909951820651, + "loss": 1.8014, + "step": 2957 + }, + { + "epoch": 0.9246639574867146, + "grad_norm": 0.251953125, + "learning_rate": 0.00017489367260395438, + "loss": 1.7982, + "step": 2958 + }, + { + "epoch": 0.9249765551734918, + "grad_norm": 0.240234375, + "learning_rate": 0.0001748773888658313, + "loss": 1.6039, + "step": 2959 + }, + { + "epoch": 0.9252891528602688, + "grad_norm": 0.24609375, + "learning_rate": 0.00017486110060726485, + "loss": 1.8941, + "step": 2960 + }, + { + "epoch": 0.925601750547046, + "grad_norm": 0.25, + "learning_rate": 0.00017484480782923835, + "loss": 2.0574, + "step": 2961 + }, + { + "epoch": 0.925914348233823, + "grad_norm": 0.244140625, + "learning_rate": 0.00017482851053273542, + "loss": 1.404, + "step": 2962 + }, + { + "epoch": 0.9262269459206002, + "grad_norm": 0.23046875, + "learning_rate": 0.00017481220871873996, + "loss": 
1.6843, + "step": 2963 + }, + { + "epoch": 0.9265395436073773, + "grad_norm": 0.263671875, + "learning_rate": 0.00017479590238823613, + "loss": 1.61, + "step": 2964 + }, + { + "epoch": 0.9268521412941544, + "grad_norm": 0.388671875, + "learning_rate": 0.00017477959154220835, + "loss": 2.4723, + "step": 2965 + }, + { + "epoch": 0.9271647389809315, + "grad_norm": 0.240234375, + "learning_rate": 0.0001747632761816413, + "loss": 1.6597, + "step": 2966 + }, + { + "epoch": 0.9274773366677087, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017474695630752008, + "loss": 1.5784, + "step": 2967 + }, + { + "epoch": 0.9277899343544858, + "grad_norm": 0.2275390625, + "learning_rate": 0.00017473063192082982, + "loss": 1.8403, + "step": 2968 + }, + { + "epoch": 0.9281025320412629, + "grad_norm": 0.25, + "learning_rate": 0.00017471430302255604, + "loss": 1.8024, + "step": 2969 + }, + { + "epoch": 0.92841512972804, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017469796961368462, + "loss": 1.714, + "step": 2970 + }, + { + "epoch": 0.9287277274148171, + "grad_norm": 0.251953125, + "learning_rate": 0.00017468163169520156, + "loss": 1.4359, + "step": 2971 + }, + { + "epoch": 0.9290403251015943, + "grad_norm": 0.23828125, + "learning_rate": 0.00017466528926809324, + "loss": 1.6177, + "step": 2972 + }, + { + "epoch": 0.9293529227883713, + "grad_norm": 0.2578125, + "learning_rate": 0.00017464894233334627, + "loss": 1.9172, + "step": 2973 + }, + { + "epoch": 0.9296655204751485, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017463259089194752, + "loss": 2.023, + "step": 2974 + }, + { + "epoch": 0.9299781181619255, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017461623494488416, + "loss": 1.3345, + "step": 2975 + }, + { + "epoch": 0.9302907158487027, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001745998744931436, + "loss": 1.6451, + "step": 2976 + }, + { + "epoch": 0.9306033135354799, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017458350953771355, + "loss": 1.4398, + "step": 2977 + }, + { + "epoch": 0.9309159112222569, + "grad_norm": 0.236328125, + "learning_rate": 0.000174567140079582, + "loss": 1.4698, + "step": 2978 + }, + { + "epoch": 0.9312285089090341, + "grad_norm": 0.2421875, + "learning_rate": 0.00017455076611973716, + "loss": 1.586, + "step": 2979 + }, + { + "epoch": 0.9315411065958112, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017453438765916758, + "loss": 1.4608, + "step": 2980 + }, + { + "epoch": 0.9318537042825883, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017451800469886207, + "loss": 1.7327, + "step": 2981 + }, + { + "epoch": 0.9321663019693655, + "grad_norm": 0.232421875, + "learning_rate": 0.0001745016172398096, + "loss": 1.7701, + "step": 2982 + }, + { + "epoch": 0.9324788996561425, + "grad_norm": 0.2421875, + "learning_rate": 0.0001744852252829996, + "loss": 1.6054, + "step": 2983 + }, + { + "epoch": 0.9327914973429197, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017446882882942162, + "loss": 1.7484, + "step": 2984 + }, + { + "epoch": 0.9331040950296968, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017445242788006552, + "loss": 1.6647, + "step": 2985 + }, + { + "epoch": 0.9334166927164739, + "grad_norm": 0.248046875, + "learning_rate": 0.0001744360224359215, + "loss": 1.6536, + "step": 2986 + }, + { + "epoch": 0.933729290403251, + "grad_norm": 0.25, + "learning_rate": 0.00017441961249797995, + "loss": 1.9033, + "step": 2987 + }, + { + "epoch": 0.9340418880900281, + "grad_norm": 0.24609375, + "learning_rate": 
0.00017440319806723157, + "loss": 1.5145, + "step": 2988 + }, + { + "epoch": 0.9343544857768052, + "grad_norm": 0.25390625, + "learning_rate": 0.0001743867791446673, + "loss": 1.6766, + "step": 2989 + }, + { + "epoch": 0.9346670834635824, + "grad_norm": 0.232421875, + "learning_rate": 0.00017437035573127836, + "loss": 1.5665, + "step": 2990 + }, + { + "epoch": 0.9349796811503595, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017435392782805628, + "loss": 1.7932, + "step": 2991 + }, + { + "epoch": 0.9352922788371366, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017433749543599287, + "loss": 1.595, + "step": 2992 + }, + { + "epoch": 0.9356048765239138, + "grad_norm": 0.228515625, + "learning_rate": 0.00017432105855608008, + "loss": 1.7333, + "step": 2993 + }, + { + "epoch": 0.9359174742106908, + "grad_norm": 0.240234375, + "learning_rate": 0.0001743046171893103, + "loss": 1.6385, + "step": 2994 + }, + { + "epoch": 0.936230071897468, + "grad_norm": 0.25390625, + "learning_rate": 0.0001742881713366761, + "loss": 1.7989, + "step": 2995 + }, + { + "epoch": 0.936542669584245, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017427172099917032, + "loss": 1.5065, + "step": 2996 + }, + { + "epoch": 0.9368552672710222, + "grad_norm": 0.244140625, + "learning_rate": 0.0001742552661777861, + "loss": 1.6564, + "step": 2997 + }, + { + "epoch": 0.9371678649577994, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017423880687351685, + "loss": 1.5779, + "step": 2998 + }, + { + "epoch": 0.9374804626445764, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001742223430873562, + "loss": 1.7974, + "step": 2999 + }, + { + "epoch": 0.9377930603313536, + "grad_norm": 0.234375, + "learning_rate": 0.0001742058748202981, + "loss": 1.4744, + "step": 3000 + }, + { + "epoch": 0.9381056580181306, + "grad_norm": 0.236328125, + "learning_rate": 0.0001741894020733368, + "loss": 1.6008, + "step": 3001 + }, + { + "epoch": 0.9384182557049078, + "grad_norm": 0.248046875, + "learning_rate": 0.00017417292484746676, + "loss": 1.5435, + "step": 3002 + }, + { + "epoch": 0.9387308533916849, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017415644314368274, + "loss": 1.6641, + "step": 3003 + }, + { + "epoch": 0.939043451078462, + "grad_norm": 0.244140625, + "learning_rate": 0.00017413995696297972, + "loss": 1.661, + "step": 3004 + }, + { + "epoch": 0.9393560487652391, + "grad_norm": 0.248046875, + "learning_rate": 0.00017412346630635303, + "loss": 1.5462, + "step": 3005 + }, + { + "epoch": 0.9396686464520163, + "grad_norm": 0.263671875, + "learning_rate": 0.00017410697117479823, + "loss": 1.7804, + "step": 3006 + }, + { + "epoch": 0.9399812441387934, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017409047156931114, + "loss": 1.8893, + "step": 3007 + }, + { + "epoch": 0.9402938418255705, + "grad_norm": 0.248046875, + "learning_rate": 0.00017407396749088787, + "loss": 1.5371, + "step": 3008 + }, + { + "epoch": 0.9406064395123476, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017405745894052477, + "loss": 1.5866, + "step": 3009 + }, + { + "epoch": 0.9409190371991247, + "grad_norm": 0.24609375, + "learning_rate": 0.00017404094591921853, + "loss": 1.5388, + "step": 3010 + }, + { + "epoch": 0.9412316348859019, + "grad_norm": 0.25390625, + "learning_rate": 0.00017402442842796604, + "loss": 1.438, + "step": 3011 + }, + { + "epoch": 0.9415442325726789, + "grad_norm": 0.251953125, + "learning_rate": 0.00017400790646776443, + "loss": 1.892, + "step": 3012 + }, + { + "epoch": 0.9418568302594561, + "grad_norm": 
0.24609375, + "learning_rate": 0.00017399138003961124, + "loss": 1.4763, + "step": 3013 + }, + { + "epoch": 0.9421694279462332, + "grad_norm": 0.25, + "learning_rate": 0.0001739748491445041, + "loss": 1.6418, + "step": 3014 + }, + { + "epoch": 0.9424820256330103, + "grad_norm": 0.240234375, + "learning_rate": 0.00017395831378344112, + "loss": 1.7746, + "step": 3015 + }, + { + "epoch": 0.9427946233197875, + "grad_norm": 0.236328125, + "learning_rate": 0.00017394177395742047, + "loss": 1.8002, + "step": 3016 + }, + { + "epoch": 0.9431072210065645, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017392522966744068, + "loss": 1.686, + "step": 3017 + }, + { + "epoch": 0.9434198186933417, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017390868091450055, + "loss": 1.6964, + "step": 3018 + }, + { + "epoch": 0.9437324163801187, + "grad_norm": 0.240234375, + "learning_rate": 0.00017389212769959922, + "loss": 1.656, + "step": 3019 + }, + { + "epoch": 0.9440450140668959, + "grad_norm": 0.240234375, + "learning_rate": 0.00017387557002373596, + "loss": 1.6357, + "step": 3020 + }, + { + "epoch": 0.944357611753673, + "grad_norm": 0.236328125, + "learning_rate": 0.00017385900788791038, + "loss": 1.8136, + "step": 3021 + }, + { + "epoch": 0.9446702094404501, + "grad_norm": 0.236328125, + "learning_rate": 0.00017384244129312239, + "loss": 1.5841, + "step": 3022 + }, + { + "epoch": 0.9449828071272273, + "grad_norm": 0.23828125, + "learning_rate": 0.00017382587024037212, + "loss": 1.5595, + "step": 3023 + }, + { + "epoch": 0.9452954048140044, + "grad_norm": 0.248046875, + "learning_rate": 0.00017380929473066, + "loss": 1.6447, + "step": 3024 + }, + { + "epoch": 0.9456080025007815, + "grad_norm": 0.2421875, + "learning_rate": 0.00017379271476498665, + "loss": 1.6323, + "step": 3025 + }, + { + "epoch": 0.9459206001875586, + "grad_norm": 0.25, + "learning_rate": 0.00017377613034435315, + "loss": 1.62, + "step": 3026 + }, + { + "epoch": 0.9462331978743357, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017375954146976058, + "loss": 1.4751, + "step": 3027 + }, + { + "epoch": 0.9465457955611128, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017374294814221055, + "loss": 2.2368, + "step": 3028 + }, + { + "epoch": 0.94685839324789, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017372635036270472, + "loss": 1.7495, + "step": 3029 + }, + { + "epoch": 0.9471709909346671, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001737097481322452, + "loss": 1.9299, + "step": 3030 + }, + { + "epoch": 0.9474835886214442, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017369314145183426, + "loss": 1.5842, + "step": 3031 + }, + { + "epoch": 0.9477961863082213, + "grad_norm": 0.240234375, + "learning_rate": 0.00017367653032247446, + "loss": 1.6439, + "step": 3032 + }, + { + "epoch": 0.9481087839949984, + "grad_norm": 0.24609375, + "learning_rate": 0.0001736599147451686, + "loss": 1.7489, + "step": 3033 + }, + { + "epoch": 0.9484213816817756, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017364329472091986, + "loss": 1.6981, + "step": 3034 + }, + { + "epoch": 0.9487339793685526, + "grad_norm": 0.24609375, + "learning_rate": 0.0001736266702507316, + "loss": 1.98, + "step": 3035 + }, + { + "epoch": 0.9490465770553298, + "grad_norm": 0.24609375, + "learning_rate": 0.0001736100413356074, + "loss": 1.5686, + "step": 3036 + }, + { + "epoch": 0.949359174742107, + "grad_norm": 0.2421875, + "learning_rate": 0.00017359340797655116, + "loss": 1.6756, + "step": 3037 + }, + { + "epoch": 0.949671772428884, + 
"grad_norm": 0.2431640625, + "learning_rate": 0.00017357677017456715, + "loss": 1.6345, + "step": 3038 + }, + { + "epoch": 0.9499843701156612, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017356012793065976, + "loss": 1.6958, + "step": 3039 + }, + { + "epoch": 0.9502969678024382, + "grad_norm": 0.234375, + "learning_rate": 0.0001735434812458337, + "loss": 1.6856, + "step": 3040 + }, + { + "epoch": 0.9506095654892154, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017352683012109395, + "loss": 1.6888, + "step": 3041 + }, + { + "epoch": 0.9509221631759925, + "grad_norm": 0.25, + "learning_rate": 0.0001735101745574458, + "loss": 1.7944, + "step": 3042 + }, + { + "epoch": 0.9512347608627696, + "grad_norm": 0.244140625, + "learning_rate": 0.0001734935145558947, + "loss": 1.4633, + "step": 3043 + }, + { + "epoch": 0.9515473585495468, + "grad_norm": 0.251953125, + "learning_rate": 0.0001734768501174465, + "loss": 1.5549, + "step": 3044 + }, + { + "epoch": 0.9518599562363238, + "grad_norm": 0.24609375, + "learning_rate": 0.00017346018124310723, + "loss": 1.6942, + "step": 3045 + }, + { + "epoch": 0.952172553923101, + "grad_norm": 0.232421875, + "learning_rate": 0.0001734435079338832, + "loss": 1.8094, + "step": 3046 + }, + { + "epoch": 0.9524851516098781, + "grad_norm": 0.244140625, + "learning_rate": 0.00017342683019078102, + "loss": 1.6422, + "step": 3047 + }, + { + "epoch": 0.9527977492966552, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017341014801480748, + "loss": 1.4798, + "step": 3048 + }, + { + "epoch": 0.9531103469834323, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001733934614069698, + "loss": 1.6282, + "step": 3049 + }, + { + "epoch": 0.9534229446702095, + "grad_norm": 0.23828125, + "learning_rate": 0.00017337677036827534, + "loss": 1.5165, + "step": 3050 + }, + { + "epoch": 0.9537355423569865, + "grad_norm": 0.248046875, + "learning_rate": 0.00017336007489973171, + "loss": 1.6635, + "step": 3051 + }, + { + "epoch": 0.9540481400437637, + "grad_norm": 0.2578125, + "learning_rate": 0.00017334337500234687, + "loss": 1.7504, + "step": 3052 + }, + { + "epoch": 0.9543607377305408, + "grad_norm": 0.25, + "learning_rate": 0.00017332667067712905, + "loss": 1.8412, + "step": 3053 + }, + { + "epoch": 0.9546733354173179, + "grad_norm": 0.2421875, + "learning_rate": 0.0001733099619250867, + "loss": 1.616, + "step": 3054 + }, + { + "epoch": 0.9549859331040951, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017329324874722847, + "loss": 1.7954, + "step": 3055 + }, + { + "epoch": 0.9552985307908721, + "grad_norm": 0.2421875, + "learning_rate": 0.00017327653114456343, + "loss": 1.6591, + "step": 3056 + }, + { + "epoch": 0.9556111284776493, + "grad_norm": 0.240234375, + "learning_rate": 0.00017325980911810085, + "loss": 1.6327, + "step": 3057 + }, + { + "epoch": 0.9559237261644263, + "grad_norm": 0.24609375, + "learning_rate": 0.00017324308266885026, + "loss": 1.5621, + "step": 3058 + }, + { + "epoch": 0.9562363238512035, + "grad_norm": 0.2578125, + "learning_rate": 0.00017322635179782138, + "loss": 2.0408, + "step": 3059 + }, + { + "epoch": 0.9565489215379807, + "grad_norm": 0.23828125, + "learning_rate": 0.00017320961650602436, + "loss": 1.5293, + "step": 3060 + }, + { + "epoch": 0.9568615192247577, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017319287679446949, + "loss": 1.5787, + "step": 3061 + }, + { + "epoch": 0.9571741169115349, + "grad_norm": 0.23828125, + "learning_rate": 0.0001731761326641674, + "loss": 1.6182, + "step": 3062 + }, + { + "epoch": 
0.957486714598312, + "grad_norm": 0.244140625, + "learning_rate": 0.0001731593841161289, + "loss": 1.6671, + "step": 3063 + }, + { + "epoch": 0.9577993122850891, + "grad_norm": 0.23828125, + "learning_rate": 0.00017314263115136516, + "loss": 1.6618, + "step": 3064 + }, + { + "epoch": 0.9581119099718662, + "grad_norm": 0.25, + "learning_rate": 0.00017312587377088756, + "loss": 1.6887, + "step": 3065 + }, + { + "epoch": 0.9584245076586433, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017310911197570777, + "loss": 1.6217, + "step": 3066 + }, + { + "epoch": 0.9587371053454204, + "grad_norm": 0.240234375, + "learning_rate": 0.00017309234576683778, + "loss": 1.7303, + "step": 3067 + }, + { + "epoch": 0.9590497030321976, + "grad_norm": 0.25, + "learning_rate": 0.0001730755751452897, + "loss": 1.6497, + "step": 3068 + }, + { + "epoch": 0.9593623007189747, + "grad_norm": 0.228515625, + "learning_rate": 0.000173058800112076, + "loss": 1.8203, + "step": 3069 + }, + { + "epoch": 0.9596748984057518, + "grad_norm": 0.255859375, + "learning_rate": 0.00017304202066820948, + "loss": 2.1236, + "step": 3070 + }, + { + "epoch": 0.9599874960925289, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001730252368147031, + "loss": 1.7534, + "step": 3071 + }, + { + "epoch": 0.960300093779306, + "grad_norm": 0.25390625, + "learning_rate": 0.00017300844855257008, + "loss": 1.6816, + "step": 3072 + }, + { + "epoch": 0.9606126914660832, + "grad_norm": 0.2373046875, + "learning_rate": 0.000172991655882824, + "loss": 1.5992, + "step": 3073 + }, + { + "epoch": 0.9609252891528602, + "grad_norm": 0.24609375, + "learning_rate": 0.00017297485880647862, + "loss": 1.8889, + "step": 3074 + }, + { + "epoch": 0.9612378868396374, + "grad_norm": 0.240234375, + "learning_rate": 0.00017295805732454804, + "loss": 1.6511, + "step": 3075 + }, + { + "epoch": 0.9615504845264145, + "grad_norm": 0.265625, + "learning_rate": 0.00017294125143804657, + "loss": 1.7686, + "step": 3076 + }, + { + "epoch": 0.9618630822131916, + "grad_norm": 0.26953125, + "learning_rate": 0.0001729244411479888, + "loss": 1.7564, + "step": 3077 + }, + { + "epoch": 0.9621756798999688, + "grad_norm": 0.23046875, + "learning_rate": 0.0001729076264553896, + "loss": 1.6458, + "step": 3078 + }, + { + "epoch": 0.9624882775867458, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017289080736126409, + "loss": 1.698, + "step": 3079 + }, + { + "epoch": 0.962800875273523, + "grad_norm": 0.25, + "learning_rate": 0.00017287398386662764, + "loss": 1.684, + "step": 3080 + }, + { + "epoch": 0.9631134729603001, + "grad_norm": 0.234375, + "learning_rate": 0.0001728571559724959, + "loss": 1.7003, + "step": 3081 + }, + { + "epoch": 0.9634260706470772, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017284032367988482, + "loss": 1.5827, + "step": 3082 + }, + { + "epoch": 0.9637386683338544, + "grad_norm": 0.240234375, + "learning_rate": 0.0001728234869898106, + "loss": 1.7952, + "step": 3083 + }, + { + "epoch": 0.9640512660206314, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017280664590328966, + "loss": 1.5528, + "step": 3084 + }, + { + "epoch": 0.9643638637074086, + "grad_norm": 0.25, + "learning_rate": 0.0001727898004213387, + "loss": 1.8732, + "step": 3085 + }, + { + "epoch": 0.9646764613941857, + "grad_norm": 0.234375, + "learning_rate": 0.00017277295054497478, + "loss": 1.5453, + "step": 3086 + }, + { + "epoch": 0.9649890590809628, + "grad_norm": 0.25, + "learning_rate": 0.00017275609627521508, + "loss": 1.8652, + "step": 3087 + }, + { + "epoch": 
0.9653016567677399, + "grad_norm": 0.23828125, + "learning_rate": 0.00017273923761307712, + "loss": 1.5761, + "step": 3088 + }, + { + "epoch": 0.965614254454517, + "grad_norm": 0.232421875, + "learning_rate": 0.00017272237455957868, + "loss": 1.3679, + "step": 3089 + }, + { + "epoch": 0.9659268521412941, + "grad_norm": 0.365234375, + "learning_rate": 0.00017270550711573788, + "loss": 2.1864, + "step": 3090 + }, + { + "epoch": 0.9662394498280713, + "grad_norm": 0.25390625, + "learning_rate": 0.0001726886352825729, + "loss": 1.8203, + "step": 3091 + }, + { + "epoch": 0.9665520475148484, + "grad_norm": 0.236328125, + "learning_rate": 0.0001726717590611024, + "loss": 1.6397, + "step": 3092 + }, + { + "epoch": 0.9668646452016255, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017265487845234524, + "loss": 1.7298, + "step": 3093 + }, + { + "epoch": 0.9671772428884027, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017263799345732043, + "loss": 1.4412, + "step": 3094 + }, + { + "epoch": 0.9674898405751797, + "grad_norm": 0.2578125, + "learning_rate": 0.0001726211040770474, + "loss": 1.6235, + "step": 3095 + }, + { + "epoch": 0.9678024382619569, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001726042103125458, + "loss": 1.4866, + "step": 3096 + }, + { + "epoch": 0.9681150359487339, + "grad_norm": 0.25, + "learning_rate": 0.0001725873121648355, + "loss": 1.8129, + "step": 3097 + }, + { + "epoch": 0.9684276336355111, + "grad_norm": 0.244140625, + "learning_rate": 0.00017257040963493663, + "loss": 1.7193, + "step": 3098 + }, + { + "epoch": 0.9687402313222883, + "grad_norm": 0.2275390625, + "learning_rate": 0.00017255350272386968, + "loss": 1.6863, + "step": 3099 + }, + { + "epoch": 0.9690528290090653, + "grad_norm": 0.2353515625, + "learning_rate": 0.00017253659143265534, + "loss": 1.5868, + "step": 3100 + }, + { + "epoch": 0.9693654266958425, + "grad_norm": 0.26171875, + "learning_rate": 0.00017251967576231448, + "loss": 1.9038, + "step": 3101 + }, + { + "epoch": 0.9696780243826195, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001725027557138684, + "loss": 1.5963, + "step": 3102 + }, + { + "epoch": 0.9699906220693967, + "grad_norm": 0.25390625, + "learning_rate": 0.0001724858312883386, + "loss": 1.9158, + "step": 3103 + }, + { + "epoch": 0.9703032197561738, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001724689024867468, + "loss": 1.7879, + "step": 3104 + }, + { + "epoch": 0.9706158174429509, + "grad_norm": 0.349609375, + "learning_rate": 0.00017245196931011495, + "loss": 2.2104, + "step": 3105 + }, + { + "epoch": 0.970928415129728, + "grad_norm": 0.25390625, + "learning_rate": 0.00017243503175946542, + "loss": 1.3733, + "step": 3106 + }, + { + "epoch": 0.9712410128165052, + "grad_norm": 0.26171875, + "learning_rate": 0.0001724180898358207, + "loss": 1.8072, + "step": 3107 + }, + { + "epoch": 0.9715536105032823, + "grad_norm": 0.25, + "learning_rate": 0.00017240114354020368, + "loss": 1.6554, + "step": 3108 + }, + { + "epoch": 0.9718662081900594, + "grad_norm": 0.232421875, + "learning_rate": 0.0001723841928736373, + "loss": 1.8434, + "step": 3109 + }, + { + "epoch": 0.9721788058768365, + "grad_norm": 0.251953125, + "learning_rate": 0.00017236723783714496, + "loss": 1.7078, + "step": 3110 + }, + { + "epoch": 0.9724914035636136, + "grad_norm": 0.248046875, + "learning_rate": 0.00017235027843175027, + "loss": 1.4973, + "step": 3111 + }, + { + "epoch": 0.9728040012503908, + "grad_norm": 0.23828125, + "learning_rate": 0.00017233331465847705, + "loss": 2.0236, + "step": 3112 + 
}, + { + "epoch": 0.9731165989371678, + "grad_norm": 0.248046875, + "learning_rate": 0.00017231634651834946, + "loss": 1.55, + "step": 3113 + }, + { + "epoch": 0.973429196623945, + "grad_norm": 0.240234375, + "learning_rate": 0.00017229937401239188, + "loss": 1.5074, + "step": 3114 + }, + { + "epoch": 0.973741794310722, + "grad_norm": 0.232421875, + "learning_rate": 0.00017228239714162896, + "loss": 1.4308, + "step": 3115 + }, + { + "epoch": 0.9740543919974992, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017226541590708566, + "loss": 1.8249, + "step": 3116 + }, + { + "epoch": 0.9743669896842764, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017224843030978705, + "loss": 1.8337, + "step": 3117 + }, + { + "epoch": 0.9746795873710534, + "grad_norm": 0.26171875, + "learning_rate": 0.00017223144035075864, + "loss": 1.7211, + "step": 3118 + }, + { + "epoch": 0.9749921850578306, + "grad_norm": 0.25, + "learning_rate": 0.00017221444603102617, + "loss": 1.7391, + "step": 3119 + }, + { + "epoch": 0.9753047827446076, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017219744735161554, + "loss": 2.0078, + "step": 3120 + }, + { + "epoch": 0.9756173804313848, + "grad_norm": 0.2412109375, + "learning_rate": 0.000172180444313553, + "loss": 1.7833, + "step": 3121 + }, + { + "epoch": 0.975929978118162, + "grad_norm": 0.251953125, + "learning_rate": 0.00017216343691786509, + "loss": 1.508, + "step": 3122 + }, + { + "epoch": 0.976242575804939, + "grad_norm": 0.251953125, + "learning_rate": 0.0001721464251655785, + "loss": 2.0652, + "step": 3123 + }, + { + "epoch": 0.9765551734917162, + "grad_norm": 0.2421875, + "learning_rate": 0.0001721294090577203, + "loss": 1.5267, + "step": 3124 + }, + { + "epoch": 0.9768677711784933, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017211238859531774, + "loss": 1.838, + "step": 3125 + }, + { + "epoch": 0.9771803688652704, + "grad_norm": 0.234375, + "learning_rate": 0.00017209536377939846, + "loss": 1.7286, + "step": 3126 + }, + { + "epoch": 0.9774929665520475, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001720783346109901, + "loss": 1.8045, + "step": 3127 + }, + { + "epoch": 0.9778055642388246, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001720613010911209, + "loss": 1.712, + "step": 3128 + }, + { + "epoch": 0.9781181619256017, + "grad_norm": 0.234375, + "learning_rate": 0.0001720442632208191, + "loss": 1.5521, + "step": 3129 + }, + { + "epoch": 0.9784307596123789, + "grad_norm": 0.23828125, + "learning_rate": 0.0001720272210011133, + "loss": 1.7718, + "step": 3130 + }, + { + "epoch": 0.978743357299156, + "grad_norm": 0.2578125, + "learning_rate": 0.00017201017443303242, + "loss": 1.4686, + "step": 3131 + }, + { + "epoch": 0.9790559549859331, + "grad_norm": 0.251953125, + "learning_rate": 0.00017199312351760555, + "loss": 1.6478, + "step": 3132 + }, + { + "epoch": 0.9793685526727102, + "grad_norm": 0.228515625, + "learning_rate": 0.00017197606825586204, + "loss": 1.4012, + "step": 3133 + }, + { + "epoch": 0.9796811503594873, + "grad_norm": 0.24609375, + "learning_rate": 0.00017195900864883158, + "loss": 1.6166, + "step": 3134 + }, + { + "epoch": 0.9799937480462645, + "grad_norm": 0.255859375, + "learning_rate": 0.00017194194469754407, + "loss": 1.7632, + "step": 3135 + }, + { + "epoch": 0.9803063457330415, + "grad_norm": 0.248046875, + "learning_rate": 0.00017192487640302969, + "loss": 1.497, + "step": 3136 + }, + { + "epoch": 0.9806189434198187, + "grad_norm": 0.25, + "learning_rate": 0.00017190780376631886, + "loss": 1.756, + "step": 
3137 + }, + { + "epoch": 0.9809315411065959, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001718907267884423, + "loss": 1.5489, + "step": 3138 + }, + { + "epoch": 0.9812441387933729, + "grad_norm": 0.26171875, + "learning_rate": 0.00017187364547043091, + "loss": 1.5929, + "step": 3139 + }, + { + "epoch": 0.9815567364801501, + "grad_norm": 0.244140625, + "learning_rate": 0.000171856559813316, + "loss": 1.7889, + "step": 3140 + }, + { + "epoch": 0.9818693341669271, + "grad_norm": 0.234375, + "learning_rate": 0.00017183946981812897, + "loss": 1.4263, + "step": 3141 + }, + { + "epoch": 0.9821819318537043, + "grad_norm": 0.259765625, + "learning_rate": 0.00017182237548590162, + "loss": 1.8588, + "step": 3142 + }, + { + "epoch": 0.9824945295404814, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017180527681766593, + "loss": 1.7062, + "step": 3143 + }, + { + "epoch": 0.9828071272272585, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017178817381445418, + "loss": 1.5145, + "step": 3144 + }, + { + "epoch": 0.9831197249140357, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001717710664772989, + "loss": 1.6806, + "step": 3145 + }, + { + "epoch": 0.9834323226008127, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017175395480723286, + "loss": 1.9361, + "step": 3146 + }, + { + "epoch": 0.9837449202875899, + "grad_norm": 0.23828125, + "learning_rate": 0.00017173683880528917, + "loss": 1.5781, + "step": 3147 + }, + { + "epoch": 0.984057517974367, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017171971847250106, + "loss": 1.5337, + "step": 3148 + }, + { + "epoch": 0.9843701156611441, + "grad_norm": 0.24609375, + "learning_rate": 0.00017170259380990216, + "loss": 1.8557, + "step": 3149 + }, + { + "epoch": 0.9846827133479212, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017168546481852634, + "loss": 1.735, + "step": 3150 + }, + { + "epoch": 0.9849953110346984, + "grad_norm": 0.234375, + "learning_rate": 0.00017166833149940763, + "loss": 1.6696, + "step": 3151 + }, + { + "epoch": 0.9853079087214754, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017165119385358045, + "loss": 1.5103, + "step": 3152 + }, + { + "epoch": 0.9856205064082526, + "grad_norm": 0.232421875, + "learning_rate": 0.00017163405188207932, + "loss": 1.3137, + "step": 3153 + }, + { + "epoch": 0.9859331040950297, + "grad_norm": 0.3125, + "learning_rate": 0.00017161690558593925, + "loss": 2.1945, + "step": 3154 + }, + { + "epoch": 0.9862457017818068, + "grad_norm": 0.251953125, + "learning_rate": 0.0001715997549661953, + "loss": 1.8129, + "step": 3155 + }, + { + "epoch": 0.986558299468584, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017158260002388294, + "loss": 1.7308, + "step": 3156 + }, + { + "epoch": 0.986870897155361, + "grad_norm": 0.240234375, + "learning_rate": 0.00017156544076003778, + "loss": 1.7969, + "step": 3157 + }, + { + "epoch": 0.9871834948421382, + "grad_norm": 0.25, + "learning_rate": 0.00017154827717569577, + "loss": 1.5541, + "step": 3158 + }, + { + "epoch": 0.9874960925289152, + "grad_norm": 0.2431640625, + "learning_rate": 0.00017153110927189307, + "loss": 1.6279, + "step": 3159 + }, + { + "epoch": 0.9878086902156924, + "grad_norm": 0.2578125, + "learning_rate": 0.00017151393704966617, + "loss": 1.7777, + "step": 3160 + }, + { + "epoch": 0.9881212879024696, + "grad_norm": 0.240234375, + "learning_rate": 0.00017149676051005176, + "loss": 1.7864, + "step": 3161 + }, + { + "epoch": 0.9884338855892466, + "grad_norm": 0.263671875, + "learning_rate": 0.0001714795796540868, + 
"loss": 1.8507, + "step": 3162 + }, + { + "epoch": 0.9887464832760238, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017146239448280853, + "loss": 1.5787, + "step": 3163 + }, + { + "epoch": 0.9890590809628009, + "grad_norm": 0.25, + "learning_rate": 0.00017144520499725444, + "loss": 1.6532, + "step": 3164 + }, + { + "epoch": 0.989371678649578, + "grad_norm": 0.265625, + "learning_rate": 0.00017142801119846227, + "loss": 1.4543, + "step": 3165 + }, + { + "epoch": 0.9896842763363551, + "grad_norm": 0.240234375, + "learning_rate": 0.00017141081308747003, + "loss": 1.639, + "step": 3166 + }, + { + "epoch": 0.9899968740231322, + "grad_norm": 0.26171875, + "learning_rate": 0.00017139361066531605, + "loss": 1.6788, + "step": 3167 + }, + { + "epoch": 0.9903094717099094, + "grad_norm": 0.25390625, + "learning_rate": 0.00017137640393303878, + "loss": 1.5768, + "step": 3168 + }, + { + "epoch": 0.9906220693966865, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017135919289167707, + "loss": 1.6102, + "step": 3169 + }, + { + "epoch": 0.9909346670834636, + "grad_norm": 0.255859375, + "learning_rate": 0.00017134197754226996, + "loss": 1.5106, + "step": 3170 + }, + { + "epoch": 0.9912472647702407, + "grad_norm": 0.24609375, + "learning_rate": 0.00017132475788585674, + "loss": 1.4294, + "step": 3171 + }, + { + "epoch": 0.9915598624570178, + "grad_norm": 0.2490234375, + "learning_rate": 0.00017130753392347698, + "loss": 1.552, + "step": 3172 + }, + { + "epoch": 0.9918724601437949, + "grad_norm": 0.2421875, + "learning_rate": 0.00017129030565617053, + "loss": 1.4553, + "step": 3173 + }, + { + "epoch": 0.9921850578305721, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017127307308497752, + "loss": 1.6594, + "step": 3174 + }, + { + "epoch": 0.9924976555173491, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001712558362109382, + "loss": 1.7315, + "step": 3175 + }, + { + "epoch": 0.9928102532041263, + "grad_norm": 0.248046875, + "learning_rate": 0.0001712385950350933, + "loss": 1.5794, + "step": 3176 + }, + { + "epoch": 0.9931228508909034, + "grad_norm": 0.240234375, + "learning_rate": 0.0001712213495584836, + "loss": 1.7619, + "step": 3177 + }, + { + "epoch": 0.9934354485776805, + "grad_norm": 0.228515625, + "learning_rate": 0.00017120409978215034, + "loss": 1.6773, + "step": 3178 + }, + { + "epoch": 0.9937480462644577, + "grad_norm": 0.2294921875, + "learning_rate": 0.00017118684570713476, + "loss": 1.5635, + "step": 3179 + }, + { + "epoch": 0.9940606439512347, + "grad_norm": 0.25390625, + "learning_rate": 0.00017116958733447862, + "loss": 1.8061, + "step": 3180 + }, + { + "epoch": 0.9943732416380119, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017115232466522379, + "loss": 1.496, + "step": 3181 + }, + { + "epoch": 0.994685839324789, + "grad_norm": 0.283203125, + "learning_rate": 0.0001711350577004125, + "loss": 1.9932, + "step": 3182 + }, + { + "epoch": 0.9949984370115661, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017111778644108707, + "loss": 1.7719, + "step": 3183 + }, + { + "epoch": 0.9953110346983433, + "grad_norm": 0.234375, + "learning_rate": 0.00017110051088829023, + "loss": 1.9202, + "step": 3184 + }, + { + "epoch": 0.9956236323851203, + "grad_norm": 0.2392578125, + "learning_rate": 0.000171083231043065, + "loss": 1.7274, + "step": 3185 + }, + { + "epoch": 0.9959362300718975, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017106594690645454, + "loss": 1.6006, + "step": 3186 + }, + { + "epoch": 0.9962488277586746, + "grad_norm": 0.25, + "learning_rate": 
0.00017104865847950224, + "loss": 1.8627, + "step": 3187 + }, + { + "epoch": 0.9965614254454517, + "grad_norm": 0.234375, + "learning_rate": 0.00017103136576325194, + "loss": 1.6147, + "step": 3188 + }, + { + "epoch": 0.9968740231322288, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017101406875874754, + "loss": 1.8255, + "step": 3189 + }, + { + "epoch": 0.9971866208190059, + "grad_norm": 0.25390625, + "learning_rate": 0.0001709967674670333, + "loss": 1.6937, + "step": 3190 + }, + { + "epoch": 0.997499218505783, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001709794618891538, + "loss": 1.7125, + "step": 3191 + }, + { + "epoch": 0.9978118161925602, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001709621520261537, + "loss": 1.7602, + "step": 3192 + }, + { + "epoch": 0.9981244138793373, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017094483787907804, + "loss": 1.8293, + "step": 3193 + }, + { + "epoch": 0.9984370115661144, + "grad_norm": 0.2314453125, + "learning_rate": 0.00017092751944897214, + "loss": 2.0362, + "step": 3194 + }, + { + "epoch": 0.9987496092528916, + "grad_norm": 0.25, + "learning_rate": 0.00017091019673688148, + "loss": 1.8003, + "step": 3195 + }, + { + "epoch": 0.9990622069396686, + "grad_norm": 0.2353515625, + "learning_rate": 0.0001708928697438519, + "loss": 1.6969, + "step": 3196 + }, + { + "epoch": 0.9993748046264458, + "grad_norm": 0.2470703125, + "learning_rate": 0.00017087553847092943, + "loss": 1.4631, + "step": 3197 + }, + { + "epoch": 0.9996874023132228, + "grad_norm": 0.248046875, + "learning_rate": 0.0001708582029191604, + "loss": 2.0063, + "step": 3198 + }, + { + "epoch": 1.0, + "grad_norm": 0.2255859375, + "learning_rate": 0.00017084086308959132, + "loss": 1.4657, + "step": 3199 + }, + { + "epoch": 1.000312597686777, + "grad_norm": 0.24609375, + "learning_rate": 0.0001708235189832691, + "loss": 1.6879, + "step": 3200 + }, + { + "epoch": 1.000312597686777, + "eval_loss": 1.5738756656646729, + "eval_runtime": 1912.6507, + "eval_samples_per_second": 4.777, + "eval_steps_per_second": 2.389, + "step": 3200 + }, + { + "epoch": 1.0006251953735543, + "grad_norm": 0.244140625, + "learning_rate": 0.0001708061706012408, + "loss": 1.8955, + "step": 3201 + }, + { + "epoch": 1.0009377930603314, + "grad_norm": 0.232421875, + "learning_rate": 0.00017078881794455373, + "loss": 1.4155, + "step": 3202 + }, + { + "epoch": 1.0012503907471084, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001707714610142555, + "loss": 1.3249, + "step": 3203 + }, + { + "epoch": 1.0015629884338857, + "grad_norm": 0.24609375, + "learning_rate": 0.000170754099811394, + "loss": 1.8316, + "step": 3204 + }, + { + "epoch": 1.0018755861206627, + "grad_norm": 0.265625, + "learning_rate": 0.00017073673433701733, + "loss": 1.7691, + "step": 3205 + }, + { + "epoch": 1.0021881838074398, + "grad_norm": 0.25, + "learning_rate": 0.00017071936459217386, + "loss": 1.7469, + "step": 3206 + }, + { + "epoch": 1.0025007814942168, + "grad_norm": 0.25390625, + "learning_rate": 0.00017070199057791222, + "loss": 1.6787, + "step": 3207 + }, + { + "epoch": 1.0028133791809941, + "grad_norm": 0.234375, + "learning_rate": 0.00017068461229528134, + "loss": 1.601, + "step": 3208 + }, + { + "epoch": 1.0031259768677712, + "grad_norm": 0.2412109375, + "learning_rate": 0.00017066722974533032, + "loss": 1.7489, + "step": 3209 + }, + { + "epoch": 1.0034385745545482, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001706498429291086, + "loss": 1.5783, + "step": 3210 + }, + { + "epoch": 1.0037511722413255, + 
"grad_norm": 0.2451171875, + "learning_rate": 0.00017063245184766585, + "loss": 1.6569, + "step": 3211 + }, + { + "epoch": 1.0040637699281025, + "grad_norm": 0.2373046875, + "learning_rate": 0.00017061505650205194, + "loss": 1.6837, + "step": 3212 + }, + { + "epoch": 1.0043763676148796, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001705976568933171, + "loss": 1.6423, + "step": 3213 + }, + { + "epoch": 1.0046889653016569, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017058025302251175, + "loss": 1.5818, + "step": 3214 + }, + { + "epoch": 1.005001562988434, + "grad_norm": 0.2392578125, + "learning_rate": 0.0001705628448906866, + "loss": 1.4841, + "step": 3215 + }, + { + "epoch": 1.005314160675211, + "grad_norm": 0.25390625, + "learning_rate": 0.00017054543249889258, + "loss": 1.5093, + "step": 3216 + }, + { + "epoch": 1.0056267583619882, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017052801584818085, + "loss": 1.6689, + "step": 3217 + }, + { + "epoch": 1.0059393560487653, + "grad_norm": 0.2421875, + "learning_rate": 0.00017051059493960297, + "loss": 1.5929, + "step": 3218 + }, + { + "epoch": 1.0062519537355423, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001704931697742106, + "loss": 1.7321, + "step": 3219 + }, + { + "epoch": 1.0065645514223194, + "grad_norm": 0.2392578125, + "learning_rate": 0.00017047574035305578, + "loss": 1.7554, + "step": 3220 + }, + { + "epoch": 1.0068771491090966, + "grad_norm": 0.2314453125, + "learning_rate": 0.00017045830667719068, + "loss": 1.5192, + "step": 3221 + }, + { + "epoch": 1.0071897467958737, + "grad_norm": 0.23046875, + "learning_rate": 0.00017044086874766783, + "loss": 1.5958, + "step": 3222 + }, + { + "epoch": 1.0075023444826507, + "grad_norm": 0.24609375, + "learning_rate": 0.00017042342656553995, + "loss": 1.6242, + "step": 3223 + }, + { + "epoch": 1.007814942169428, + "grad_norm": 0.2421875, + "learning_rate": 0.00017040598013186008, + "loss": 1.8164, + "step": 3224 + }, + { + "epoch": 1.008127539856205, + "grad_norm": 0.251953125, + "learning_rate": 0.00017038852944768152, + "loss": 1.5937, + "step": 3225 + }, + { + "epoch": 1.0084401375429821, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017037107451405768, + "loss": 1.7428, + "step": 3226 + }, + { + "epoch": 1.0087527352297594, + "grad_norm": 0.263671875, + "learning_rate": 0.00017035361533204239, + "loss": 1.6019, + "step": 3227 + }, + { + "epoch": 1.0090653329165364, + "grad_norm": 0.24609375, + "learning_rate": 0.00017033615190268972, + "loss": 1.5994, + "step": 3228 + }, + { + "epoch": 1.0093779306033135, + "grad_norm": 0.240234375, + "learning_rate": 0.00017031868422705393, + "loss": 1.6935, + "step": 3229 + }, + { + "epoch": 1.0096905282900908, + "grad_norm": 0.2333984375, + "learning_rate": 0.00017030121230618954, + "loss": 1.4656, + "step": 3230 + }, + { + "epoch": 1.0100031259768678, + "grad_norm": 0.244140625, + "learning_rate": 0.0001702837361411514, + "loss": 1.755, + "step": 3231 + }, + { + "epoch": 1.0103157236636449, + "grad_norm": 0.23828125, + "learning_rate": 0.00017026625573299454, + "loss": 1.8469, + "step": 3232 + }, + { + "epoch": 1.010628321350422, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017024877108277425, + "loss": 1.6266, + "step": 3233 + }, + { + "epoch": 1.0109409190371992, + "grad_norm": 0.25390625, + "learning_rate": 0.00017023128219154616, + "loss": 1.7379, + "step": 3234 + }, + { + "epoch": 1.0112535167239762, + "grad_norm": 0.25, + "learning_rate": 0.0001702137890603661, + "loss": 1.6069, + "step": 3235 + }, + { + 
"epoch": 1.000312597686777, + "grad_norm": 0.232421875, + "learning_rate": 0.00017019629169029007, + "loss": 1.7767, + "step": 3236 + }, + { + "epoch": 1.0006251953735543, + "grad_norm": 0.251953125, + "learning_rate": 0.0001701787900823745, + "loss": 1.8416, + "step": 3237 + }, + { + "epoch": 1.0009377930603314, + "grad_norm": 0.25, + "learning_rate": 0.0001701612842376759, + "loss": 1.3778, + "step": 3238 + }, + { + "epoch": 1.0012503907471084, + "grad_norm": 0.2421875, + "learning_rate": 0.00017014377415725118, + "loss": 1.7909, + "step": 3239 + }, + { + "epoch": 1.0015629884338857, + "grad_norm": 0.25, + "learning_rate": 0.0001701262598421574, + "loss": 1.6575, + "step": 3240 + }, + { + "epoch": 1.0018755861206627, + "grad_norm": 0.2353515625, + "learning_rate": 0.000170108741293452, + "loss": 1.8209, + "step": 3241 + }, + { + "epoch": 1.0021881838074398, + "grad_norm": 0.26953125, + "learning_rate": 0.00017009121851219253, + "loss": 1.6912, + "step": 3242 + }, + { + "epoch": 1.0025007814942168, + "grad_norm": 0.263671875, + "learning_rate": 0.0001700736914994369, + "loss": 1.5038, + "step": 3243 + }, + { + "epoch": 1.0028133791809941, + "grad_norm": 0.259765625, + "learning_rate": 0.00017005616025624317, + "loss": 1.364, + "step": 3244 + }, + { + "epoch": 1.0031259768677712, + "grad_norm": 0.25390625, + "learning_rate": 0.0001700386247836698, + "loss": 1.8364, + "step": 3245 + }, + { + "epoch": 1.0034385745545482, + "grad_norm": 0.265625, + "learning_rate": 0.00017002108508277542, + "loss": 1.6403, + "step": 3246 + }, + { + "epoch": 1.0037511722413255, + "grad_norm": 0.259765625, + "learning_rate": 0.00017000354115461887, + "loss": 1.8994, + "step": 3247 + }, + { + "epoch": 1.0040637699281025, + "grad_norm": 0.24609375, + "learning_rate": 0.00016998599300025938, + "loss": 1.6125, + "step": 3248 + }, + { + "epoch": 1.0043763676148796, + "grad_norm": 0.267578125, + "learning_rate": 0.00016996844062075624, + "loss": 1.7269, + "step": 3249 + }, + { + "epoch": 1.0046889653016569, + "grad_norm": 0.25390625, + "learning_rate": 0.00016995088401716924, + "loss": 1.4252, + "step": 3250 + }, + { + "epoch": 1.005001562988434, + "grad_norm": 0.248046875, + "learning_rate": 0.00016993332319055818, + "loss": 1.5086, + "step": 3251 + }, + { + "epoch": 1.005314160675211, + "grad_norm": 0.26171875, + "learning_rate": 0.00016991575814198333, + "loss": 1.702, + "step": 3252 + }, + { + "epoch": 1.0056267583619882, + "grad_norm": 0.2451171875, + "learning_rate": 0.00016989818887250504, + "loss": 1.635, + "step": 3253 + }, + { + "epoch": 1.0059393560487653, + "grad_norm": 0.25, + "learning_rate": 0.00016988061538318402, + "loss": 1.7163, + "step": 3254 + }, + { + "epoch": 1.0062519537355423, + "grad_norm": 0.23828125, + "learning_rate": 0.0001698630376750812, + "loss": 1.5299, + "step": 3255 + }, + { + "epoch": 1.0065645514223194, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001698454557492578, + "loss": 2.0135, + "step": 3256 + }, + { + "epoch": 1.0068771491090966, + "grad_norm": 0.2490234375, + "learning_rate": 0.00016982786960677518, + "loss": 1.4811, + "step": 3257 + }, + { + "epoch": 1.0071897467958737, + "grad_norm": 0.36328125, + "learning_rate": 0.0001698102792486951, + "loss": 2.2739, + "step": 3258 + }, + { + "epoch": 1.0075023444826507, + "grad_norm": 0.251953125, + "learning_rate": 0.00016979268467607952, + "loss": 1.3993, + "step": 3259 + }, + { + "epoch": 1.007814942169428, + "grad_norm": 0.2431640625, + "learning_rate": 0.00016977508588999062, + "loss": 1.4281, + "step": 3260 + }, + { + 
"epoch": 1.008127539856205, + "grad_norm": 0.255859375, + "learning_rate": 0.00016975748289149088, + "loss": 1.4793, + "step": 3261 + }, + { + "epoch": 1.0084401375429821, + "grad_norm": 0.26171875, + "learning_rate": 0.00016973987568164297, + "loss": 1.8675, + "step": 3262 + }, + { + "epoch": 1.0087527352297594, + "grad_norm": 0.2490234375, + "learning_rate": 0.00016972226426150994, + "loss": 1.5013, + "step": 3263 + }, + { + "epoch": 1.0090653329165364, + "grad_norm": 0.255859375, + "learning_rate": 0.00016970464863215495, + "loss": 1.6938, + "step": 3264 + }, + { + "epoch": 1.0093779306033135, + "grad_norm": 0.255859375, + "learning_rate": 0.0001696870287946415, + "loss": 1.4364, + "step": 3265 + }, + { + "epoch": 1.0096905282900908, + "grad_norm": 0.279296875, + "learning_rate": 0.0001696694047500333, + "loss": 1.6278, + "step": 3266 + }, + { + "epoch": 1.0100031259768678, + "grad_norm": 0.263671875, + "learning_rate": 0.0001696517764993944, + "loss": 2.03, + "step": 3267 + }, + { + "epoch": 1.0103157236636449, + "grad_norm": 0.263671875, + "learning_rate": 0.0001696341440437889, + "loss": 1.6937, + "step": 3268 + }, + { + "epoch": 1.010628321350422, + "grad_norm": 0.2578125, + "learning_rate": 0.00016961650738428146, + "loss": 1.6141, + "step": 3269 + }, + { + "epoch": 1.0109409190371992, + "grad_norm": 0.259765625, + "learning_rate": 0.00016959886652193678, + "loss": 1.5632, + "step": 3270 + }, + { + "epoch": 1.0112535167239762, + "grad_norm": 0.25390625, + "learning_rate": 0.0001695812214578198, + "loss": 1.8156, + "step": 3271 + }, + { + "epoch": 1.0115661144107533, + "grad_norm": 0.2353515625, + "learning_rate": 0.00016956357219299583, + "loss": 1.5758, + "step": 3272 + }, + { + "epoch": 1.0118787120975306, + "grad_norm": 0.25, + "learning_rate": 0.00016954591872853035, + "loss": 1.3459, + "step": 3273 + }, + { + "epoch": 1.0121913097843076, + "grad_norm": 0.244140625, + "learning_rate": 0.0001695282610654891, + "loss": 1.4463, + "step": 3274 + }, + { + "epoch": 1.0125039074710847, + "grad_norm": 0.25390625, + "learning_rate": 0.00016951059920493816, + "loss": 1.4842, + "step": 3275 + }, + { + "epoch": 1.012816505157862, + "grad_norm": 0.2734375, + "learning_rate": 0.0001694929331479438, + "loss": 1.8982, + "step": 3276 + }, + { + "epoch": 1.013129102844639, + "grad_norm": 0.2421875, + "learning_rate": 0.00016947526289557246, + "loss": 1.5836, + "step": 3277 + }, + { + "epoch": 1.013441700531416, + "grad_norm": 0.263671875, + "learning_rate": 0.000169457588448891, + "loss": 1.659, + "step": 3278 + }, + { + "epoch": 1.0137542982181933, + "grad_norm": 0.2578125, + "learning_rate": 0.00016943990980896638, + "loss": 1.8123, + "step": 3279 + }, + { + "epoch": 1.0140668959049703, + "grad_norm": 0.275390625, + "learning_rate": 0.00016942222697686595, + "loss": 1.9833, + "step": 3280 + }, + { + "epoch": 1.0143794935917474, + "grad_norm": 0.2734375, + "learning_rate": 0.0001694045399536572, + "loss": 1.414, + "step": 3281 + }, + { + "epoch": 1.0146920912785244, + "grad_norm": 0.26171875, + "learning_rate": 0.00016938684874040792, + "loss": 1.8165, + "step": 3282 + }, + { + "epoch": 1.0150046889653017, + "grad_norm": 0.23828125, + "learning_rate": 0.00016936915333818617, + "loss": 1.5251, + "step": 3283 + }, + { + "epoch": 1.0153172866520788, + "grad_norm": 0.265625, + "learning_rate": 0.00016935145374806024, + "loss": 1.4809, + "step": 3284 + }, + { + "epoch": 1.0156298843388558, + "grad_norm": 0.2490234375, + "learning_rate": 0.00016933374997109865, + "loss": 1.3905, + "step": 3285 + }, + 
{ + "epoch": 1.015942482025633, + "grad_norm": 0.251953125, + "learning_rate": 0.00016931604200837023, + "loss": 1.7941, + "step": 3286 + }, + { + "epoch": 1.0162550797124101, + "grad_norm": 0.2470703125, + "learning_rate": 0.00016929832986094402, + "loss": 1.547, + "step": 3287 + }, + { + "epoch": 1.0165676773991872, + "grad_norm": 0.24609375, + "learning_rate": 0.00016928061352988936, + "loss": 1.6516, + "step": 3288 + }, + { + "epoch": 1.0168802750859645, + "grad_norm": 0.2373046875, + "learning_rate": 0.00016926289301627575, + "loss": 1.6519, + "step": 3289 + }, + { + "epoch": 1.0171928727727415, + "grad_norm": 0.255859375, + "learning_rate": 0.000169245168321173, + "loss": 1.6238, + "step": 3290 + }, + { + "epoch": 1.0175054704595186, + "grad_norm": 0.26171875, + "learning_rate": 0.00016922743944565124, + "loss": 1.6475, + "step": 3291 + }, + { + "epoch": 1.0178180681462958, + "grad_norm": 0.25390625, + "learning_rate": 0.00016920970639078075, + "loss": 1.7816, + "step": 3292 + }, + { + "epoch": 1.0181306658330729, + "grad_norm": 0.259765625, + "learning_rate": 0.00016919196915763206, + "loss": 1.612, + "step": 3293 + }, + { + "epoch": 1.01844326351985, + "grad_norm": 0.275390625, + "learning_rate": 0.00016917422774727602, + "loss": 1.6574, + "step": 3294 + }, + { + "epoch": 1.018755861206627, + "grad_norm": 0.26953125, + "learning_rate": 0.00016915648216078374, + "loss": 1.698, + "step": 3295 + }, + { + "epoch": 1.0190684588934042, + "grad_norm": 0.25, + "learning_rate": 0.0001691387323992265, + "loss": 1.3921, + "step": 3296 + }, + { + "epoch": 1.0193810565801813, + "grad_norm": 0.259765625, + "learning_rate": 0.00016912097846367584, + "loss": 1.5295, + "step": 3297 + }, + { + "epoch": 1.0196936542669583, + "grad_norm": 0.240234375, + "learning_rate": 0.00016910322035520363, + "loss": 1.5175, + "step": 3298 + }, + { + "epoch": 1.0200062519537356, + "grad_norm": 0.2451171875, + "learning_rate": 0.00016908545807488195, + "loss": 1.8336, + "step": 3299 + }, + { + "epoch": 1.0203188496405127, + "grad_norm": 0.25, + "learning_rate": 0.00016906769162378316, + "loss": 1.6812, + "step": 3300 + }, + { + "epoch": 1.0206314473272897, + "grad_norm": 0.251953125, + "learning_rate": 0.0001690499210029798, + "loss": 1.4225, + "step": 3301 + }, + { + "epoch": 1.020944045014067, + "grad_norm": 0.26171875, + "learning_rate": 0.00016903214621354472, + "loss": 1.5756, + "step": 3302 + }, + { + "epoch": 1.021256642700844, + "grad_norm": 0.2578125, + "learning_rate": 0.000169014367256551, + "loss": 1.7286, + "step": 3303 + }, + { + "epoch": 1.021569240387621, + "grad_norm": 0.255859375, + "learning_rate": 0.00016899658413307197, + "loss": 1.462, + "step": 3304 + }, + { + "epoch": 1.0218818380743981, + "grad_norm": 0.25, + "learning_rate": 0.00016897879684418126, + "loss": 1.6298, + "step": 3305 + }, + { + "epoch": 1.0221944357611754, + "grad_norm": 0.2490234375, + "learning_rate": 0.00016896100539095266, + "loss": 1.5951, + "step": 3306 + }, + { + "epoch": 1.0225070334479525, + "grad_norm": 0.25, + "learning_rate": 0.00016894320977446032, + "loss": 1.6417, + "step": 3307 + }, + { + "epoch": 1.0228196311347295, + "grad_norm": 0.25390625, + "learning_rate": 0.0001689254099957785, + "loss": 1.5105, + "step": 3308 + }, + { + "epoch": 1.0231322288215068, + "grad_norm": 0.259765625, + "learning_rate": 0.0001689076060559819, + "loss": 1.6498, + "step": 3309 + }, + { + "epoch": 1.0234448265082838, + "grad_norm": 0.3203125, + "learning_rate": 0.00016888979795614525, + "loss": 2.3866, + "step": 3310 + }, + { + 
"epoch": 1.0237574241950609, + "grad_norm": 0.2421875, + "learning_rate": 0.00016887198569734375, + "loss": 1.6526, + "step": 3311 + }, + { + "epoch": 1.0240700218818382, + "grad_norm": 0.251953125, + "learning_rate": 0.00016885416928065272, + "loss": 1.6504, + "step": 3312 + }, + { + "epoch": 1.0243826195686152, + "grad_norm": 0.26953125, + "learning_rate": 0.00016883634870714772, + "loss": 1.5203, + "step": 3313 + }, + { + "epoch": 1.0246952172553923, + "grad_norm": 0.2578125, + "learning_rate": 0.00016881852397790465, + "loss": 1.3883, + "step": 3314 + }, + { + "epoch": 1.0250078149421695, + "grad_norm": 0.24609375, + "learning_rate": 0.0001688006950939996, + "loss": 1.419, + "step": 3315 + }, + { + "epoch": 1.0253204126289466, + "grad_norm": 0.30859375, + "learning_rate": 0.00016878286205650888, + "loss": 2.3922, + "step": 3316 + }, + { + "epoch": 1.0256330103157236, + "grad_norm": 0.251953125, + "learning_rate": 0.00016876502486650914, + "loss": 1.4724, + "step": 3317 + }, + { + "epoch": 1.0259456080025007, + "grad_norm": 0.25390625, + "learning_rate": 0.0001687471835250772, + "loss": 1.541, + "step": 3318 + }, + { + "epoch": 1.026258205689278, + "grad_norm": 0.25390625, + "learning_rate": 0.00016872933803329025, + "loss": 1.6618, + "step": 3319 + }, + { + "epoch": 1.026570803376055, + "grad_norm": 0.251953125, + "learning_rate": 0.00016871148839222552, + "loss": 1.8275, + "step": 3320 + }, + { + "epoch": 1.026883401062832, + "grad_norm": 0.2578125, + "learning_rate": 0.0001686936346029607, + "loss": 1.7057, + "step": 3321 + }, + { + "epoch": 1.0271959987496093, + "grad_norm": 0.259765625, + "learning_rate": 0.00016867577666657363, + "loss": 1.5628, + "step": 3322 + }, + { + "epoch": 1.0275085964363864, + "grad_norm": 0.255859375, + "learning_rate": 0.0001686579145841424, + "loss": 1.5199, + "step": 3323 + }, + { + "epoch": 1.0278211941231634, + "grad_norm": 0.255859375, + "learning_rate": 0.00016864004835674535, + "loss": 1.1948, + "step": 3324 + }, + { + "epoch": 1.0281337918099407, + "grad_norm": 0.2451171875, + "learning_rate": 0.00016862217798546117, + "loss": 1.569, + "step": 3325 + }, + { + "epoch": 1.0284463894967177, + "grad_norm": 0.26171875, + "learning_rate": 0.0001686043034713686, + "loss": 1.5452, + "step": 3326 + }, + { + "epoch": 1.0287589871834948, + "grad_norm": 0.255859375, + "learning_rate": 0.00016858642481554684, + "loss": 1.5146, + "step": 3327 + }, + { + "epoch": 1.029071584870272, + "grad_norm": 0.24609375, + "learning_rate": 0.0001685685420190752, + "loss": 1.7647, + "step": 3328 + }, + { + "epoch": 1.029384182557049, + "grad_norm": 0.26171875, + "learning_rate": 0.0001685506550830333, + "loss": 1.705, + "step": 3329 + }, + { + "epoch": 1.0296967802438262, + "grad_norm": 0.25, + "learning_rate": 0.000168532764008501, + "loss": 1.7709, + "step": 3330 + }, + { + "epoch": 1.0300093779306032, + "grad_norm": 0.263671875, + "learning_rate": 0.0001685148687965584, + "loss": 1.3867, + "step": 3331 + }, + { + "epoch": 1.0303219756173805, + "grad_norm": 0.2578125, + "learning_rate": 0.00016849696944828586, + "loss": 1.5167, + "step": 3332 + }, + { + "epoch": 1.0306345733041575, + "grad_norm": 0.26953125, + "learning_rate": 0.000168479065964764, + "loss": 1.7646, + "step": 3333 + }, + { + "epoch": 1.0309471709909346, + "grad_norm": 0.279296875, + "learning_rate": 0.00016846115834707367, + "loss": 1.7008, + "step": 3334 + }, + { + "epoch": 1.0312597686777119, + "grad_norm": 0.2578125, + "learning_rate": 0.00016844324659629595, + "loss": 1.3537, + "step": 3335 + }, + { + 
"epoch": 1.031572366364489, + "grad_norm": 0.24609375, + "learning_rate": 0.00016842533071351223, + "loss": 1.2803, + "step": 3336 + }, + { + "epoch": 1.031884964051266, + "grad_norm": 0.26171875, + "learning_rate": 0.00016840741069980408, + "loss": 1.6595, + "step": 3337 + }, + { + "epoch": 1.0321975617380432, + "grad_norm": 0.263671875, + "learning_rate": 0.00016838948655625337, + "loss": 2.0696, + "step": 3338 + }, + { + "epoch": 1.0325101594248203, + "grad_norm": 0.240234375, + "learning_rate": 0.00016837155828394225, + "loss": 1.5362, + "step": 3339 + }, + { + "epoch": 1.0328227571115973, + "grad_norm": 0.2470703125, + "learning_rate": 0.000168353625883953, + "loss": 1.4373, + "step": 3340 + }, + { + "epoch": 1.0331353547983746, + "grad_norm": 0.2431640625, + "learning_rate": 0.00016833568935736826, + "loss": 1.2943, + "step": 3341 + }, + { + "epoch": 1.0334479524851516, + "grad_norm": 0.271484375, + "learning_rate": 0.00016831774870527087, + "loss": 1.6662, + "step": 3342 + }, + { + "epoch": 1.0337605501719287, + "grad_norm": 0.2412109375, + "learning_rate": 0.00016829980392874392, + "loss": 1.5353, + "step": 3343 + }, + { + "epoch": 1.0340731478587057, + "grad_norm": 0.25390625, + "learning_rate": 0.00016828185502887078, + "loss": 1.6163, + "step": 3344 + }, + { + "epoch": 1.034385745545483, + "grad_norm": 0.25390625, + "learning_rate": 0.00016826390200673502, + "loss": 1.3735, + "step": 3345 + }, + { + "epoch": 1.03469834323226, + "grad_norm": 0.255859375, + "learning_rate": 0.00016824594486342052, + "loss": 1.6119, + "step": 3346 + }, + { + "epoch": 1.0350109409190371, + "grad_norm": 0.255859375, + "learning_rate": 0.00016822798360001138, + "loss": 1.8152, + "step": 3347 + }, + { + "epoch": 1.0353235386058144, + "grad_norm": 0.2578125, + "learning_rate": 0.00016821001821759192, + "loss": 1.61, + "step": 3348 + }, + { + "epoch": 1.0356361362925914, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001681920487172467, + "loss": 1.6514, + "step": 3349 + }, + { + "epoch": 1.0359487339793685, + "grad_norm": 0.24609375, + "learning_rate": 0.00016817407510006066, + "loss": 1.6709, + "step": 3350 + }, + { + "epoch": 1.0362613316661458, + "grad_norm": 0.265625, + "learning_rate": 0.00016815609736711882, + "loss": 1.7892, + "step": 3351 + }, + { + "epoch": 1.0365739293529228, + "grad_norm": 0.26171875, + "learning_rate": 0.0001681381155195065, + "loss": 1.4647, + "step": 3352 + }, + { + "epoch": 1.0368865270396999, + "grad_norm": 0.2490234375, + "learning_rate": 0.00016812012955830935, + "loss": 1.5796, + "step": 3353 + }, + { + "epoch": 1.0371991247264771, + "grad_norm": 0.255859375, + "learning_rate": 0.00016810213948461315, + "loss": 1.525, + "step": 3354 + }, + { + "epoch": 1.0375117224132542, + "grad_norm": 0.2578125, + "learning_rate": 0.000168084145299504, + "loss": 1.6335, + "step": 3355 + }, + { + "epoch": 1.0378243201000312, + "grad_norm": 0.26171875, + "learning_rate": 0.00016806614700406826, + "loss": 1.5538, + "step": 3356 + }, + { + "epoch": 1.0381369177868083, + "grad_norm": 0.24609375, + "learning_rate": 0.00016804814459939248, + "loss": 1.7529, + "step": 3357 + }, + { + "epoch": 1.0384495154735855, + "grad_norm": 0.25390625, + "learning_rate": 0.00016803013808656348, + "loss": 1.9922, + "step": 3358 + }, + { + "epoch": 1.0387621131603626, + "grad_norm": 0.2734375, + "learning_rate": 0.00016801212746666834, + "loss": 1.807, + "step": 3359 + }, + { + "epoch": 1.0390747108471396, + "grad_norm": 0.2578125, + "learning_rate": 0.00016799411274079446, + "loss": 1.7129, + "step": 
3360 + }, + { + "epoch": 1.039387308533917, + "grad_norm": 0.255859375, + "learning_rate": 0.00016797609391002932, + "loss": 1.6807, + "step": 3361 + }, + { + "epoch": 1.039699906220694, + "grad_norm": 0.255859375, + "learning_rate": 0.00016795807097546073, + "loss": 1.5031, + "step": 3362 + }, + { + "epoch": 1.040012503907471, + "grad_norm": 0.2470703125, + "learning_rate": 0.00016794004393817682, + "loss": 1.4893, + "step": 3363 + }, + { + "epoch": 1.0403251015942483, + "grad_norm": 0.248046875, + "learning_rate": 0.00016792201279926586, + "loss": 1.8596, + "step": 3364 + }, + { + "epoch": 1.0406376992810253, + "grad_norm": 0.25390625, + "learning_rate": 0.0001679039775598165, + "loss": 1.5344, + "step": 3365 + }, + { + "epoch": 1.0409502969678024, + "grad_norm": 0.27734375, + "learning_rate": 0.00016788593822091743, + "loss": 1.5466, + "step": 3366 + }, + { + "epoch": 1.0412628946545797, + "grad_norm": 0.265625, + "learning_rate": 0.00016786789478365777, + "loss": 1.5067, + "step": 3367 + }, + { + "epoch": 1.0415754923413567, + "grad_norm": 0.25, + "learning_rate": 0.0001678498472491268, + "loss": 1.5952, + "step": 3368 + }, + { + "epoch": 1.0418880900281338, + "grad_norm": 0.255859375, + "learning_rate": 0.0001678317956184141, + "loss": 1.3462, + "step": 3369 + }, + { + "epoch": 1.0422006877149108, + "grad_norm": 0.265625, + "learning_rate": 0.0001678137398926095, + "loss": 1.9309, + "step": 3370 + }, + { + "epoch": 1.042513285401688, + "grad_norm": 0.2431640625, + "learning_rate": 0.00016779568007280294, + "loss": 1.3741, + "step": 3371 + }, + { + "epoch": 1.0428258830884651, + "grad_norm": 0.255859375, + "learning_rate": 0.00016777761616008482, + "loss": 1.6094, + "step": 3372 + }, + { + "epoch": 1.0431384807752422, + "grad_norm": 0.25390625, + "learning_rate": 0.00016775954815554563, + "loss": 1.5275, + "step": 3373 + }, + { + "epoch": 1.0434510784620195, + "grad_norm": 0.259765625, + "learning_rate": 0.0001677414760602762, + "loss": 1.6487, + "step": 3374 + }, + { + "epoch": 1.0437636761487965, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001677233998753675, + "loss": 1.7877, + "step": 3375 + }, + { + "epoch": 1.0440762738355736, + "grad_norm": 0.259765625, + "learning_rate": 0.00016770531960191086, + "loss": 1.8232, + "step": 3376 + }, + { + "epoch": 1.0443888715223508, + "grad_norm": 0.251953125, + "learning_rate": 0.00016768723524099782, + "loss": 1.5395, + "step": 3377 + }, + { + "epoch": 1.0447014692091279, + "grad_norm": 0.251953125, + "learning_rate": 0.0001676691467937201, + "loss": 1.5682, + "step": 3378 + }, + { + "epoch": 1.045014066895905, + "grad_norm": 0.25390625, + "learning_rate": 0.00016765105426116977, + "loss": 1.9006, + "step": 3379 + }, + { + "epoch": 1.045326664582682, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001676329576444391, + "loss": 1.7142, + "step": 3380 + }, + { + "epoch": 1.0456392622694592, + "grad_norm": 0.26171875, + "learning_rate": 0.00016761485694462058, + "loss": 1.502, + "step": 3381 + }, + { + "epoch": 1.0459518599562363, + "grad_norm": 0.25, + "learning_rate": 0.00016759675216280697, + "loss": 1.5799, + "step": 3382 + }, + { + "epoch": 1.0462644576430133, + "grad_norm": 0.283203125, + "learning_rate": 0.00016757864330009132, + "loss": 1.3851, + "step": 3383 + }, + { + "epoch": 1.0465770553297906, + "grad_norm": 0.2451171875, + "learning_rate": 0.0001675605303575668, + "loss": 1.747, + "step": 3384 + }, + { + "epoch": 1.0468896530165677, + "grad_norm": 0.26171875, + "learning_rate": 0.00016754241333632704, + "loss": 1.5268, + 
"step": 3385 + }, + { + "epoch": 1.0472022507033447, + "grad_norm": 0.25390625, + "learning_rate": 0.0001675242922374657, + "loss": 1.8177, + "step": 3386 + }, + { + "epoch": 1.047514848390122, + "grad_norm": 0.23828125, + "learning_rate": 0.00016750616706207678, + "loss": 1.841, + "step": 3387 + }, + { + "epoch": 1.047827446076899, + "grad_norm": 0.25390625, + "learning_rate": 0.00016748803781125454, + "loss": 1.6151, + "step": 3388 + }, + { + "epoch": 1.048140043763676, + "grad_norm": 0.255859375, + "learning_rate": 0.00016746990448609342, + "loss": 1.5231, + "step": 3389 + }, + { + "epoch": 1.0484526414504534, + "grad_norm": 0.26171875, + "learning_rate": 0.00016745176708768823, + "loss": 1.5842, + "step": 3390 + }, + { + "epoch": 1.0487652391372304, + "grad_norm": 0.263671875, + "learning_rate": 0.00016743362561713387, + "loss": 1.5161, + "step": 3391 + }, + { + "epoch": 1.0490778368240075, + "grad_norm": 0.25390625, + "learning_rate": 0.00016741548007552566, + "loss": 1.5454, + "step": 3392 + }, + { + "epoch": 1.0493904345107845, + "grad_norm": 0.271484375, + "learning_rate": 0.00016739733046395894, + "loss": 1.5117, + "step": 3393 + }, + { + "epoch": 1.0497030321975618, + "grad_norm": 0.26171875, + "learning_rate": 0.00016737917678352954, + "loss": 1.7243, + "step": 3394 + }, + { + "epoch": 1.0500156298843388, + "grad_norm": 0.265625, + "learning_rate": 0.00016736101903533335, + "loss": 1.7013, + "step": 3395 + }, + { + "epoch": 1.0503282275711159, + "grad_norm": 0.267578125, + "learning_rate": 0.00016734285722046663, + "loss": 1.6861, + "step": 3396 + }, + { + "epoch": 1.0506408252578932, + "grad_norm": 0.244140625, + "learning_rate": 0.00016732469134002578, + "loss": 1.5176, + "step": 3397 + }, + { + "epoch": 1.0509534229446702, + "grad_norm": 0.2734375, + "learning_rate": 0.00016730652139510752, + "loss": 1.4484, + "step": 3398 + }, + { + "epoch": 1.0512660206314473, + "grad_norm": 0.25390625, + "learning_rate": 0.0001672883473868088, + "loss": 1.6714, + "step": 3399 + }, + { + "epoch": 1.0515786183182245, + "grad_norm": 0.2578125, + "learning_rate": 0.0001672701693162268, + "loss": 1.7261, + "step": 3400 + }, + { + "epoch": 1.0518912160050016, + "grad_norm": 0.263671875, + "learning_rate": 0.00016725198718445898, + "loss": 1.7724, + "step": 3401 + }, + { + "epoch": 1.0522038136917786, + "grad_norm": 0.26171875, + "learning_rate": 0.00016723380099260295, + "loss": 1.6345, + "step": 3402 + }, + { + "epoch": 1.052516411378556, + "grad_norm": 0.265625, + "learning_rate": 0.00016721561074175672, + "loss": 1.7645, + "step": 3403 + }, + { + "epoch": 1.052829009065333, + "grad_norm": 0.251953125, + "learning_rate": 0.0001671974164330184, + "loss": 1.4015, + "step": 3404 + }, + { + "epoch": 1.05314160675211, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001671792180674864, + "loss": 1.6586, + "step": 3405 + }, + { + "epoch": 1.053454204438887, + "grad_norm": 0.25, + "learning_rate": 0.0001671610156462594, + "loss": 1.7158, + "step": 3406 + }, + { + "epoch": 1.0537668021256643, + "grad_norm": 0.25, + "learning_rate": 0.0001671428091704363, + "loss": 1.615, + "step": 3407 + }, + { + "epoch": 1.0540793998124414, + "grad_norm": 0.2578125, + "learning_rate": 0.0001671245986411163, + "loss": 1.9367, + "step": 3408 + }, + { + "epoch": 1.0543919974992184, + "grad_norm": 0.25390625, + "learning_rate": 0.00016710638405939866, + "loss": 1.6939, + "step": 3409 + }, + { + "epoch": 1.0547045951859957, + "grad_norm": 0.25390625, + "learning_rate": 0.00016708816542638317, + "loss": 1.4502, + "step": 
3410 + }, + { + "epoch": 1.0550171928727727, + "grad_norm": 0.265625, + "learning_rate": 0.00016706994274316963, + "loss": 1.4356, + "step": 3411 + }, + { + "epoch": 1.0553297905595498, + "grad_norm": 0.255859375, + "learning_rate": 0.00016705171601085814, + "loss": 1.672, + "step": 3412 + }, + { + "epoch": 1.055642388246327, + "grad_norm": 0.263671875, + "learning_rate": 0.00016703348523054915, + "loss": 1.6083, + "step": 3413 + }, + { + "epoch": 1.055954985933104, + "grad_norm": 0.26953125, + "learning_rate": 0.00016701525040334323, + "loss": 1.3438, + "step": 3414 + }, + { + "epoch": 1.0562675836198812, + "grad_norm": 0.2470703125, + "learning_rate": 0.00016699701153034122, + "loss": 1.5055, + "step": 3415 + }, + { + "epoch": 1.0565801813066584, + "grad_norm": 0.267578125, + "learning_rate": 0.00016697876861264426, + "loss": 1.6327, + "step": 3416 + }, + { + "epoch": 1.0568927789934355, + "grad_norm": 0.26171875, + "learning_rate": 0.0001669605216513537, + "loss": 1.4462, + "step": 3417 + }, + { + "epoch": 1.0572053766802125, + "grad_norm": 0.248046875, + "learning_rate": 0.0001669422706475711, + "loss": 1.4378, + "step": 3418 + }, + { + "epoch": 1.0575179743669896, + "grad_norm": 0.240234375, + "learning_rate": 0.00016692401560239835, + "loss": 1.7001, + "step": 3419 + }, + { + "epoch": 1.0578305720537668, + "grad_norm": 0.2451171875, + "learning_rate": 0.00016690575651693746, + "loss": 1.5977, + "step": 3420 + }, + { + "epoch": 1.058143169740544, + "grad_norm": 0.2470703125, + "learning_rate": 0.00016688749339229079, + "loss": 1.5273, + "step": 3421 + }, + { + "epoch": 1.058455767427321, + "grad_norm": 0.26171875, + "learning_rate": 0.00016686922622956093, + "loss": 1.6957, + "step": 3422 + }, + { + "epoch": 1.0587683651140982, + "grad_norm": 0.287109375, + "learning_rate": 0.00016685095502985066, + "loss": 2.3469, + "step": 3423 + }, + { + "epoch": 1.0590809628008753, + "grad_norm": 0.267578125, + "learning_rate": 0.000166832679794263, + "loss": 2.0036, + "step": 3424 + }, + { + "epoch": 1.0593935604876523, + "grad_norm": 0.275390625, + "learning_rate": 0.0001668144005239014, + "loss": 1.606, + "step": 3425 + }, + { + "epoch": 1.0597061581744296, + "grad_norm": 0.26171875, + "learning_rate": 0.00016679611721986923, + "loss": 1.5605, + "step": 3426 + }, + { + "epoch": 1.0600187558612066, + "grad_norm": 0.2421875, + "learning_rate": 0.00016677782988327032, + "loss": 1.4457, + "step": 3427 + }, + { + "epoch": 1.0603313535479837, + "grad_norm": 0.259765625, + "learning_rate": 0.00016675953851520876, + "loss": 1.6452, + "step": 3428 + }, + { + "epoch": 1.060643951234761, + "grad_norm": 0.294921875, + "learning_rate": 0.00016674124311678878, + "loss": 2.1626, + "step": 3429 + }, + { + "epoch": 1.060956548921538, + "grad_norm": 0.259765625, + "learning_rate": 0.00016672294368911495, + "loss": 1.6135, + "step": 3430 + }, + { + "epoch": 1.061269146608315, + "grad_norm": 0.263671875, + "learning_rate": 0.00016670464023329194, + "loss": 1.6071, + "step": 3431 + }, + { + "epoch": 1.0615817442950921, + "grad_norm": 0.2734375, + "learning_rate": 0.00016668633275042477, + "loss": 1.8194, + "step": 3432 + }, + { + "epoch": 1.0618943419818694, + "grad_norm": 0.26171875, + "learning_rate": 0.00016666802124161876, + "loss": 1.3389, + "step": 3433 + }, + { + "epoch": 1.0622069396686464, + "grad_norm": 0.248046875, + "learning_rate": 0.0001666497057079793, + "loss": 1.832, + "step": 3434 + }, + { + "epoch": 1.0625195373554235, + "grad_norm": 0.26171875, + "learning_rate": 0.00016663138615061223, + 
"loss": 1.5173, + "step": 3435 + }, + { + "epoch": 1.0628321350422008, + "grad_norm": 0.2490234375, + "learning_rate": 0.00016661306257062346, + "loss": 1.5316, + "step": 3436 + }, + { + "epoch": 1.0631447327289778, + "grad_norm": 0.2470703125, + "learning_rate": 0.0001665947349691192, + "loss": 1.8877, + "step": 3437 + }, + { + "epoch": 1.0634573304157549, + "grad_norm": 0.2578125, + "learning_rate": 0.00016657640334720594, + "loss": 1.6651, + "step": 3438 + }, + { + "epoch": 1.0637699281025321, + "grad_norm": 0.263671875, + "learning_rate": 0.00016655806770599034, + "loss": 1.3243, + "step": 3439 + }, + { + "epoch": 1.0640825257893092, + "grad_norm": 0.251953125, + "learning_rate": 0.00016653972804657938, + "loss": 1.704, + "step": 3440 + }, + { + "epoch": 1.0643951234760862, + "grad_norm": 0.2490234375, + "learning_rate": 0.00016652138437008027, + "loss": 1.4222, + "step": 3441 + }, + { + "epoch": 1.0647077211628635, + "grad_norm": 0.251953125, + "learning_rate": 0.00016650303667760044, + "loss": 1.7757, + "step": 3442 + }, + { + "epoch": 1.0650203188496405, + "grad_norm": 0.267578125, + "learning_rate": 0.0001664846849702475, + "loss": 1.7258, + "step": 3443 + }, + { + "epoch": 1.0653329165364176, + "grad_norm": 0.263671875, + "learning_rate": 0.00016646632924912939, + "loss": 1.7039, + "step": 3444 + }, + { + "epoch": 1.0656455142231946, + "grad_norm": 0.25390625, + "learning_rate": 0.00016644796951535432, + "loss": 1.6286, + "step": 3445 + }, + { + "epoch": 1.065958111909972, + "grad_norm": 0.263671875, + "learning_rate": 0.00016642960577003066, + "loss": 1.4467, + "step": 3446 + }, + { + "epoch": 1.066270709596749, + "grad_norm": 0.275390625, + "learning_rate": 0.000166411238014267, + "loss": 1.5944, + "step": 3447 + }, + { + "epoch": 1.066583307283526, + "grad_norm": 0.24609375, + "learning_rate": 0.00016639286624917232, + "loss": 1.7422, + "step": 3448 + }, + { + "epoch": 1.0668959049703033, + "grad_norm": 0.251953125, + "learning_rate": 0.00016637449047585568, + "loss": 1.6975, + "step": 3449 + }, + { + "epoch": 1.0672085026570803, + "grad_norm": 0.248046875, + "learning_rate": 0.00016635611069542648, + "loss": 1.5701, + "step": 3450 + }, + { + "epoch": 1.0675211003438574, + "grad_norm": 0.24609375, + "learning_rate": 0.0001663377269089943, + "loss": 1.7853, + "step": 3451 + }, + { + "epoch": 1.0678336980306347, + "grad_norm": 0.2578125, + "learning_rate": 0.00016631933911766904, + "loss": 1.5712, + "step": 3452 + }, + { + "epoch": 1.0681462957174117, + "grad_norm": 0.25390625, + "learning_rate": 0.00016630094732256073, + "loss": 1.6186, + "step": 3453 + }, + { + "epoch": 1.0684588934041888, + "grad_norm": 0.275390625, + "learning_rate": 0.00016628255152477978, + "loss": 1.5015, + "step": 3454 + }, + { + "epoch": 1.068771491090966, + "grad_norm": 0.25390625, + "learning_rate": 0.00016626415172543672, + "loss": 1.6665, + "step": 3455 + }, + { + "epoch": 1.069084088777743, + "grad_norm": 0.255859375, + "learning_rate": 0.00016624574792564235, + "loss": 1.7477, + "step": 3456 + }, + { + "epoch": 1.0693966864645201, + "grad_norm": 0.26953125, + "learning_rate": 0.0001662273401265078, + "loss": 1.7283, + "step": 3457 + }, + { + "epoch": 1.0697092841512972, + "grad_norm": 0.263671875, + "learning_rate": 0.0001662089283291443, + "loss": 1.5704, + "step": 3458 + }, + { + "epoch": 1.0700218818380745, + "grad_norm": 0.26953125, + "learning_rate": 0.00016619051253466343, + "loss": 1.5737, + "step": 3459 + }, + { + "epoch": 1.0703344795248515, + "grad_norm": 0.2578125, + "learning_rate": 
0.000166172092744177, + "loss": 1.4126, + "step": 3460 + }, + { + "epoch": 1.0706470772116286, + "grad_norm": 0.265625, + "learning_rate": 0.000166153668958797, + "loss": 1.5957, + "step": 3461 + }, + { + "epoch": 1.0709596748984058, + "grad_norm": 0.2734375, + "learning_rate": 0.00016613524117963565, + "loss": 1.8575, + "step": 3462 + }, + { + "epoch": 1.0712722725851829, + "grad_norm": 0.265625, + "learning_rate": 0.00016611680940780557, + "loss": 1.4722, + "step": 3463 + }, + { + "epoch": 1.07158487027196, + "grad_norm": 0.267578125, + "learning_rate": 0.00016609837364441944, + "loss": 1.3546, + "step": 3464 + }, + { + "epoch": 1.0718974679587372, + "grad_norm": 0.265625, + "learning_rate": 0.00016607993389059023, + "loss": 1.7442, + "step": 3465 + }, + { + "epoch": 1.0722100656455142, + "grad_norm": 0.26171875, + "learning_rate": 0.00016606149014743125, + "loss": 1.58, + "step": 3466 + }, + { + "epoch": 1.0725226633322913, + "grad_norm": 0.25390625, + "learning_rate": 0.0001660430424160559, + "loss": 1.4769, + "step": 3467 + }, + { + "epoch": 1.0728352610190686, + "grad_norm": 0.255859375, + "learning_rate": 0.00016602459069757795, + "loss": 1.6728, + "step": 3468 + }, + { + "epoch": 1.0731478587058456, + "grad_norm": 0.275390625, + "learning_rate": 0.0001660061349931113, + "loss": 1.6091, + "step": 3469 + }, + { + "epoch": 1.0734604563926227, + "grad_norm": 0.2578125, + "learning_rate": 0.00016598767530377016, + "loss": 1.5068, + "step": 3470 + }, + { + "epoch": 1.0737730540793997, + "grad_norm": 0.255859375, + "learning_rate": 0.00016596921163066899, + "loss": 1.4278, + "step": 3471 + }, + { + "epoch": 1.074085651766177, + "grad_norm": 0.27734375, + "learning_rate": 0.00016595074397492246, + "loss": 1.458, + "step": 3472 + }, + { + "epoch": 1.074398249452954, + "grad_norm": 0.2734375, + "learning_rate": 0.00016593227233764545, + "loss": 1.6513, + "step": 3473 + }, + { + "epoch": 1.074710847139731, + "grad_norm": 0.25390625, + "learning_rate": 0.00016591379671995315, + "loss": 1.397, + "step": 3474 + }, + { + "epoch": 1.0750234448265084, + "grad_norm": 0.265625, + "learning_rate": 0.00016589531712296092, + "loss": 1.5697, + "step": 3475 + }, + { + "epoch": 1.0753360425132854, + "grad_norm": 0.255859375, + "learning_rate": 0.00016587683354778445, + "loss": 1.4832, + "step": 3476 + }, + { + "epoch": 1.0756486402000625, + "grad_norm": 0.25390625, + "learning_rate": 0.00016585834599553958, + "loss": 1.6339, + "step": 3477 + }, + { + "epoch": 1.0759612378868397, + "grad_norm": 0.263671875, + "learning_rate": 0.00016583985446734246, + "loss": 1.6584, + "step": 3478 + }, + { + "epoch": 1.0762738355736168, + "grad_norm": 0.26953125, + "learning_rate": 0.00016582135896430945, + "loss": 1.5755, + "step": 3479 + }, + { + "epoch": 1.0765864332603938, + "grad_norm": 0.26171875, + "learning_rate": 0.00016580285948755705, + "loss": 1.4067, + "step": 3480 + }, + { + "epoch": 1.076899030947171, + "grad_norm": 0.263671875, + "learning_rate": 0.0001657843560382022, + "loss": 1.5585, + "step": 3481 + }, + { + "epoch": 1.0772116286339481, + "grad_norm": 0.27734375, + "learning_rate": 0.00016576584861736197, + "loss": 1.7006, + "step": 3482 + }, + { + "epoch": 1.0775242263207252, + "grad_norm": 0.259765625, + "learning_rate": 0.00016574733722615363, + "loss": 1.2913, + "step": 3483 + }, + { + "epoch": 1.0778368240075022, + "grad_norm": 0.2578125, + "learning_rate": 0.00016572882186569477, + "loss": 1.6498, + "step": 3484 + }, + { + "epoch": 1.0781494216942795, + "grad_norm": 0.2470703125, + 
"learning_rate": 0.00016571030253710315, + "loss": 1.6436, + "step": 3485 + }, + { + "epoch": 1.0784620193810566, + "grad_norm": 0.25390625, + "learning_rate": 0.00016569177924149686, + "loss": 1.5416, + "step": 3486 + }, + { + "epoch": 1.0787746170678336, + "grad_norm": 0.271484375, + "learning_rate": 0.00016567325197999413, + "loss": 1.3999, + "step": 3487 + }, + { + "epoch": 1.079087214754611, + "grad_norm": 0.25390625, + "learning_rate": 0.0001656547207537135, + "loss": 1.4576, + "step": 3488 + }, + { + "epoch": 1.079399812441388, + "grad_norm": 0.275390625, + "learning_rate": 0.00016563618556377372, + "loss": 1.7535, + "step": 3489 + }, + { + "epoch": 1.079712410128165, + "grad_norm": 0.259765625, + "learning_rate": 0.00016561764641129372, + "loss": 1.5851, + "step": 3490 + }, + { + "epoch": 1.0800250078149423, + "grad_norm": 0.2578125, + "learning_rate": 0.00016559910329739285, + "loss": 1.4995, + "step": 3491 + }, + { + "epoch": 1.0803376055017193, + "grad_norm": 0.2734375, + "learning_rate": 0.0001655805562231905, + "loss": 1.6751, + "step": 3492 + }, + { + "epoch": 1.0806502031884964, + "grad_norm": 0.259765625, + "learning_rate": 0.00016556200518980641, + "loss": 1.4372, + "step": 3493 + }, + { + "epoch": 1.0809628008752736, + "grad_norm": 0.267578125, + "learning_rate": 0.0001655434501983605, + "loss": 1.7528, + "step": 3494 + }, + { + "epoch": 1.0812753985620507, + "grad_norm": 0.275390625, + "learning_rate": 0.000165524891249973, + "loss": 1.5594, + "step": 3495 + }, + { + "epoch": 1.0815879962488277, + "grad_norm": 0.259765625, + "learning_rate": 0.0001655063283457643, + "loss": 1.5924, + "step": 3496 + }, + { + "epoch": 1.0819005939356048, + "grad_norm": 0.26953125, + "learning_rate": 0.00016548776148685512, + "loss": 1.7186, + "step": 3497 + }, + { + "epoch": 1.082213191622382, + "grad_norm": 0.265625, + "learning_rate": 0.00016546919067436628, + "loss": 1.6862, + "step": 3498 + }, + { + "epoch": 1.082525789309159, + "grad_norm": 0.25, + "learning_rate": 0.000165450615909419, + "loss": 1.4892, + "step": 3499 + }, + { + "epoch": 1.0828383869959362, + "grad_norm": 0.24609375, + "learning_rate": 0.0001654320371931346, + "loss": 1.43, + "step": 3500 + }, + { + "epoch": 1.0831509846827134, + "grad_norm": 0.33984375, + "learning_rate": 0.00016541345452663478, + "loss": 2.2191, + "step": 3501 + }, + { + "epoch": 1.0834635823694905, + "grad_norm": 0.255859375, + "learning_rate": 0.00016539486791104132, + "loss": 1.3285, + "step": 3502 + }, + { + "epoch": 1.0837761800562675, + "grad_norm": 0.2578125, + "learning_rate": 0.00016537627734747635, + "loss": 1.4255, + "step": 3503 + }, + { + "epoch": 1.0840887777430448, + "grad_norm": 0.26171875, + "learning_rate": 0.0001653576828370622, + "loss": 1.9487, + "step": 3504 + }, + { + "epoch": 1.0844013754298218, + "grad_norm": 0.2578125, + "learning_rate": 0.0001653390843809215, + "loss": 1.5777, + "step": 3505 + }, + { + "epoch": 1.084713973116599, + "grad_norm": 0.255859375, + "learning_rate": 0.00016532048198017696, + "loss": 1.3895, + "step": 3506 + }, + { + "epoch": 1.0850265708033762, + "grad_norm": 0.255859375, + "learning_rate": 0.0001653018756359517, + "loss": 1.5765, + "step": 3507 + }, + { + "epoch": 1.0853391684901532, + "grad_norm": 0.263671875, + "learning_rate": 0.000165283265349369, + "loss": 1.7731, + "step": 3508 + }, + { + "epoch": 1.0856517661769303, + "grad_norm": 0.26953125, + "learning_rate": 0.00016526465112155238, + "loss": 1.6334, + "step": 3509 + }, + { + "epoch": 1.0859643638637073, + "grad_norm": 0.265625, + 
"learning_rate": 0.00016524603295362558, + "loss": 1.3732, + "step": 3510 + }, + { + "epoch": 1.0862769615504846, + "grad_norm": 0.255859375, + "learning_rate": 0.00016522741084671268, + "loss": 1.7101, + "step": 3511 + }, + { + "epoch": 1.0865895592372616, + "grad_norm": 0.26171875, + "learning_rate": 0.0001652087848019378, + "loss": 1.6682, + "step": 3512 + }, + { + "epoch": 1.0869021569240387, + "grad_norm": 0.25390625, + "learning_rate": 0.00016519015482042556, + "loss": 1.8203, + "step": 3513 + }, + { + "epoch": 1.087214754610816, + "grad_norm": 0.26953125, + "learning_rate": 0.00016517152090330054, + "loss": 1.544, + "step": 3514 + }, + { + "epoch": 1.087527352297593, + "grad_norm": 0.265625, + "learning_rate": 0.0001651528830516878, + "loss": 1.5914, + "step": 3515 + }, + { + "epoch": 1.08783994998437, + "grad_norm": 0.251953125, + "learning_rate": 0.00016513424126671241, + "loss": 1.8966, + "step": 3516 + }, + { + "epoch": 1.0881525476711473, + "grad_norm": 0.263671875, + "learning_rate": 0.00016511559554949993, + "loss": 1.3955, + "step": 3517 + }, + { + "epoch": 1.0884651453579244, + "grad_norm": 0.25390625, + "learning_rate": 0.00016509694590117598, + "loss": 1.6077, + "step": 3518 + }, + { + "epoch": 1.0887777430447014, + "grad_norm": 0.267578125, + "learning_rate": 0.00016507829232286644, + "loss": 1.5418, + "step": 3519 + }, + { + "epoch": 1.0890903407314787, + "grad_norm": 0.2578125, + "learning_rate": 0.00016505963481569747, + "loss": 1.638, + "step": 3520 + }, + { + "epoch": 1.0894029384182558, + "grad_norm": 0.26953125, + "learning_rate": 0.0001650409733807954, + "loss": 1.6119, + "step": 3521 + }, + { + "epoch": 1.0897155361050328, + "grad_norm": 0.24609375, + "learning_rate": 0.00016502230801928694, + "loss": 1.5711, + "step": 3522 + }, + { + "epoch": 1.0900281337918099, + "grad_norm": 0.259765625, + "learning_rate": 0.00016500363873229882, + "loss": 1.5858, + "step": 3523 + }, + { + "epoch": 1.0903407314785871, + "grad_norm": 0.271484375, + "learning_rate": 0.00016498496552095823, + "loss": 1.5925, + "step": 3524 + }, + { + "epoch": 1.0906533291653642, + "grad_norm": 0.25, + "learning_rate": 0.0001649662883863925, + "loss": 1.5732, + "step": 3525 + }, + { + "epoch": 1.0909659268521412, + "grad_norm": 0.26171875, + "learning_rate": 0.0001649476073297291, + "loss": 1.5185, + "step": 3526 + }, + { + "epoch": 1.0912785245389185, + "grad_norm": 0.251953125, + "learning_rate": 0.00016492892235209588, + "loss": 1.5267, + "step": 3527 + }, + { + "epoch": 1.0915911222256955, + "grad_norm": 0.2578125, + "learning_rate": 0.00016491023345462091, + "loss": 1.6801, + "step": 3528 + }, + { + "epoch": 1.0919037199124726, + "grad_norm": 0.25390625, + "learning_rate": 0.00016489154063843242, + "loss": 1.5361, + "step": 3529 + }, + { + "epoch": 1.0922163175992499, + "grad_norm": 0.255859375, + "learning_rate": 0.00016487284390465893, + "loss": 1.5234, + "step": 3530 + }, + { + "epoch": 1.092528915286027, + "grad_norm": 0.26953125, + "learning_rate": 0.00016485414325442918, + "loss": 1.4755, + "step": 3531 + }, + { + "epoch": 1.092841512972804, + "grad_norm": 0.263671875, + "learning_rate": 0.00016483543868887215, + "loss": 1.3309, + "step": 3532 + }, + { + "epoch": 1.0931541106595812, + "grad_norm": 0.2578125, + "learning_rate": 0.00016481673020911708, + "loss": 1.9242, + "step": 3533 + }, + { + "epoch": 1.0934667083463583, + "grad_norm": 0.2734375, + "learning_rate": 0.00016479801781629338, + "loss": 1.4854, + "step": 3534 + }, + { + "epoch": 1.0937793060331353, + "grad_norm": 
0.255859375, + "learning_rate": 0.0001647793015115308, + "loss": 1.3303, + "step": 3535 + }, + { + "epoch": 1.0940919037199124, + "grad_norm": 0.255859375, + "learning_rate": 0.00016476058129595927, + "loss": 1.4886, + "step": 3536 + }, + { + "epoch": 1.0944045014066897, + "grad_norm": 0.259765625, + "learning_rate": 0.00016474185717070886, + "loss": 1.5427, + "step": 3537 + }, + { + "epoch": 1.0947170990934667, + "grad_norm": 0.267578125, + "learning_rate": 0.00016472312913691007, + "loss": 1.3118, + "step": 3538 + }, + { + "epoch": 1.0950296967802438, + "grad_norm": 0.26953125, + "learning_rate": 0.00016470439719569346, + "loss": 1.4825, + "step": 3539 + }, + { + "epoch": 1.095342294467021, + "grad_norm": 0.26171875, + "learning_rate": 0.00016468566134818997, + "loss": 1.672, + "step": 3540 + }, + { + "epoch": 1.095654892153798, + "grad_norm": 0.30859375, + "learning_rate": 0.00016466692159553066, + "loss": 2.2183, + "step": 3541 + }, + { + "epoch": 1.0959674898405751, + "grad_norm": 0.259765625, + "learning_rate": 0.0001646481779388469, + "loss": 1.6677, + "step": 3542 + }, + { + "epoch": 1.0962800875273524, + "grad_norm": 0.25, + "learning_rate": 0.00016462943037927024, + "loss": 1.3428, + "step": 3543 + }, + { + "epoch": 1.0965926852141294, + "grad_norm": 0.27734375, + "learning_rate": 0.00016461067891793252, + "loss": 1.3268, + "step": 3544 + }, + { + "epoch": 1.0969052829009065, + "grad_norm": 0.265625, + "learning_rate": 0.00016459192355596576, + "loss": 1.3895, + "step": 3545 + }, + { + "epoch": 1.0972178805876835, + "grad_norm": 0.2734375, + "learning_rate": 0.0001645731642945023, + "loss": 1.5051, + "step": 3546 + }, + { + "epoch": 1.0975304782744608, + "grad_norm": 0.2578125, + "learning_rate": 0.0001645544011346746, + "loss": 1.5549, + "step": 3547 + }, + { + "epoch": 1.0978430759612379, + "grad_norm": 0.28515625, + "learning_rate": 0.00016453563407761544, + "loss": 1.6162, + "step": 3548 + }, + { + "epoch": 1.098155673648015, + "grad_norm": 0.255859375, + "learning_rate": 0.00016451686312445783, + "loss": 1.575, + "step": 3549 + }, + { + "epoch": 1.0984682713347922, + "grad_norm": 0.2734375, + "learning_rate": 0.00016449808827633498, + "loss": 1.641, + "step": 3550 + }, + { + "epoch": 1.0987808690215692, + "grad_norm": 0.2470703125, + "learning_rate": 0.00016447930953438034, + "loss": 1.4562, + "step": 3551 + }, + { + "epoch": 1.0990934667083463, + "grad_norm": 0.271484375, + "learning_rate": 0.0001644605268997276, + "loss": 1.6116, + "step": 3552 + }, + { + "epoch": 1.0994060643951236, + "grad_norm": 0.265625, + "learning_rate": 0.00016444174037351074, + "loss": 1.408, + "step": 3553 + }, + { + "epoch": 1.0997186620819006, + "grad_norm": 0.263671875, + "learning_rate": 0.00016442294995686388, + "loss": 1.4525, + "step": 3554 + }, + { + "epoch": 1.1000312597686777, + "grad_norm": 0.271484375, + "learning_rate": 0.00016440415565092145, + "loss": 1.6818, + "step": 3555 + }, + { + "epoch": 1.100343857455455, + "grad_norm": 0.251953125, + "learning_rate": 0.00016438535745681802, + "loss": 1.4404, + "step": 3556 + }, + { + "epoch": 1.100656455142232, + "grad_norm": 0.2578125, + "learning_rate": 0.00016436655537568857, + "loss": 1.3231, + "step": 3557 + }, + { + "epoch": 1.100969052829009, + "grad_norm": 0.25390625, + "learning_rate": 0.00016434774940866814, + "loss": 1.5111, + "step": 3558 + }, + { + "epoch": 1.101281650515786, + "grad_norm": 0.275390625, + "learning_rate": 0.00016432893955689205, + "loss": 1.4369, + "step": 3559 + }, + { + "epoch": 1.1015942482025634, + 
"grad_norm": 0.2734375, + "learning_rate": 0.00016431012582149594, + "loss": 1.5618, + "step": 3560 + }, + { + "epoch": 1.1019068458893404, + "grad_norm": 0.26953125, + "learning_rate": 0.00016429130820361555, + "loss": 1.536, + "step": 3561 + }, + { + "epoch": 1.1022194435761175, + "grad_norm": 0.25, + "learning_rate": 0.00016427248670438697, + "loss": 1.5354, + "step": 3562 + }, + { + "epoch": 1.1025320412628947, + "grad_norm": 0.259765625, + "learning_rate": 0.0001642536613249465, + "loss": 1.9262, + "step": 3563 + }, + { + "epoch": 1.1028446389496718, + "grad_norm": 0.25, + "learning_rate": 0.00016423483206643057, + "loss": 1.5448, + "step": 3564 + }, + { + "epoch": 1.1031572366364488, + "grad_norm": 0.263671875, + "learning_rate": 0.00016421599892997595, + "loss": 1.5508, + "step": 3565 + }, + { + "epoch": 1.103469834323226, + "grad_norm": 0.267578125, + "learning_rate": 0.0001641971619167197, + "loss": 1.4567, + "step": 3566 + }, + { + "epoch": 1.1037824320100031, + "grad_norm": 0.26171875, + "learning_rate": 0.00016417832102779895, + "loss": 1.532, + "step": 3567 + }, + { + "epoch": 1.1040950296967802, + "grad_norm": 0.279296875, + "learning_rate": 0.0001641594762643512, + "loss": 1.8043, + "step": 3568 + }, + { + "epoch": 1.1044076273835572, + "grad_norm": 0.251953125, + "learning_rate": 0.00016414062762751407, + "loss": 1.5684, + "step": 3569 + }, + { + "epoch": 1.1047202250703345, + "grad_norm": 0.255859375, + "learning_rate": 0.00016412177511842554, + "loss": 1.5399, + "step": 3570 + }, + { + "epoch": 1.1050328227571116, + "grad_norm": 0.2890625, + "learning_rate": 0.00016410291873822375, + "loss": 1.8069, + "step": 3571 + }, + { + "epoch": 1.1053454204438886, + "grad_norm": 0.25390625, + "learning_rate": 0.00016408405848804703, + "loss": 1.4957, + "step": 3572 + }, + { + "epoch": 1.1056580181306659, + "grad_norm": 0.26171875, + "learning_rate": 0.00016406519436903407, + "loss": 1.589, + "step": 3573 + }, + { + "epoch": 1.105970615817443, + "grad_norm": 0.2578125, + "learning_rate": 0.00016404632638232367, + "loss": 1.3094, + "step": 3574 + }, + { + "epoch": 1.10628321350422, + "grad_norm": 0.287109375, + "learning_rate": 0.00016402745452905496, + "loss": 2.02, + "step": 3575 + }, + { + "epoch": 1.1065958111909973, + "grad_norm": 0.26953125, + "learning_rate": 0.00016400857881036717, + "loss": 1.2923, + "step": 3576 + }, + { + "epoch": 1.1069084088777743, + "grad_norm": 0.251953125, + "learning_rate": 0.00016398969922739996, + "loss": 1.4637, + "step": 3577 + }, + { + "epoch": 1.1072210065645514, + "grad_norm": 0.259765625, + "learning_rate": 0.00016397081578129304, + "loss": 1.547, + "step": 3578 + }, + { + "epoch": 1.1075336042513286, + "grad_norm": 0.26171875, + "learning_rate": 0.00016395192847318648, + "loss": 1.507, + "step": 3579 + }, + { + "epoch": 1.1078462019381057, + "grad_norm": 0.251953125, + "learning_rate": 0.00016393303730422048, + "loss": 1.78, + "step": 3580 + }, + { + "epoch": 1.1081587996248827, + "grad_norm": 0.2578125, + "learning_rate": 0.00016391414227553554, + "loss": 1.4306, + "step": 3581 + }, + { + "epoch": 1.1084713973116598, + "grad_norm": 0.26953125, + "learning_rate": 0.00016389524338827237, + "loss": 1.7353, + "step": 3582 + }, + { + "epoch": 1.108783994998437, + "grad_norm": 0.265625, + "learning_rate": 0.00016387634064357197, + "loss": 1.3343, + "step": 3583 + }, + { + "epoch": 1.109096592685214, + "grad_norm": 0.251953125, + "learning_rate": 0.0001638574340425755, + "loss": 1.4785, + "step": 3584 + }, + { + "epoch": 1.1094091903719911, + 
"grad_norm": 0.2578125, + "learning_rate": 0.00016383852358642432, + "loss": 1.7248, + "step": 3585 + }, + { + "epoch": 1.1097217880587684, + "grad_norm": 0.26953125, + "learning_rate": 0.00016381960927626014, + "loss": 1.6223, + "step": 3586 + }, + { + "epoch": 1.1100343857455455, + "grad_norm": 0.2890625, + "learning_rate": 0.00016380069111322483, + "loss": 1.7505, + "step": 3587 + }, + { + "epoch": 1.1103469834323225, + "grad_norm": 0.251953125, + "learning_rate": 0.00016378176909846048, + "loss": 1.4425, + "step": 3588 + }, + { + "epoch": 1.1106595811190998, + "grad_norm": 0.26953125, + "learning_rate": 0.0001637628432331095, + "loss": 1.4442, + "step": 3589 + }, + { + "epoch": 1.1109721788058768, + "grad_norm": 0.265625, + "learning_rate": 0.00016374391351831435, + "loss": 1.5085, + "step": 3590 + }, + { + "epoch": 1.111284776492654, + "grad_norm": 0.265625, + "learning_rate": 0.00016372497995521793, + "loss": 1.7437, + "step": 3591 + }, + { + "epoch": 1.1115973741794312, + "grad_norm": 0.2578125, + "learning_rate": 0.0001637060425449633, + "loss": 1.7599, + "step": 3592 + }, + { + "epoch": 1.1119099718662082, + "grad_norm": 0.251953125, + "learning_rate": 0.00016368710128869367, + "loss": 1.6399, + "step": 3593 + }, + { + "epoch": 1.1122225695529853, + "grad_norm": 0.259765625, + "learning_rate": 0.00016366815618755256, + "loss": 1.7215, + "step": 3594 + }, + { + "epoch": 1.1125351672397623, + "grad_norm": 0.28515625, + "learning_rate": 0.00016364920724268377, + "loss": 1.5205, + "step": 3595 + }, + { + "epoch": 1.1128477649265396, + "grad_norm": 0.265625, + "learning_rate": 0.0001636302544552312, + "loss": 1.5398, + "step": 3596 + }, + { + "epoch": 1.1131603626133166, + "grad_norm": 0.24609375, + "learning_rate": 0.00016361129782633911, + "loss": 1.6437, + "step": 3597 + }, + { + "epoch": 1.1134729603000937, + "grad_norm": 0.259765625, + "learning_rate": 0.0001635923373571519, + "loss": 1.5017, + "step": 3598 + }, + { + "epoch": 1.113785557986871, + "grad_norm": 0.265625, + "learning_rate": 0.00016357337304881423, + "loss": 1.724, + "step": 3599 + }, + { + "epoch": 1.114098155673648, + "grad_norm": 0.271484375, + "learning_rate": 0.00016355440490247103, + "loss": 1.4658, + "step": 3600 + }, + { + "epoch": 1.114410753360425, + "grad_norm": 0.263671875, + "learning_rate": 0.0001635354329192674, + "loss": 1.8218, + "step": 3601 + }, + { + "epoch": 1.1147233510472023, + "grad_norm": 0.267578125, + "learning_rate": 0.00016351645710034873, + "loss": 1.6092, + "step": 3602 + }, + { + "epoch": 1.1150359487339794, + "grad_norm": 0.267578125, + "learning_rate": 0.00016349747744686064, + "loss": 1.7769, + "step": 3603 + }, + { + "epoch": 1.1153485464207564, + "grad_norm": 0.259765625, + "learning_rate": 0.00016347849395994887, + "loss": 1.4195, + "step": 3604 + }, + { + "epoch": 1.1156611441075337, + "grad_norm": 0.271484375, + "learning_rate": 0.00016345950664075956, + "loss": 1.5754, + "step": 3605 + }, + { + "epoch": 1.1159737417943107, + "grad_norm": 0.259765625, + "learning_rate": 0.00016344051549043896, + "loss": 1.5302, + "step": 3606 + }, + { + "epoch": 1.1162863394810878, + "grad_norm": 0.259765625, + "learning_rate": 0.0001634215205101336, + "loss": 1.5346, + "step": 3607 + }, + { + "epoch": 1.1165989371678648, + "grad_norm": 0.25390625, + "learning_rate": 0.0001634025217009902, + "loss": 1.5775, + "step": 3608 + }, + { + "epoch": 1.1169115348546421, + "grad_norm": 0.267578125, + "learning_rate": 0.0001633835190641558, + "loss": 1.4287, + "step": 3609 + }, + { + "epoch": 
1.1172241325414192, + "grad_norm": 0.26171875, + "learning_rate": 0.0001633645126007776, + "loss": 1.6013, + "step": 3610 + }, + { + "epoch": 1.1175367302281962, + "grad_norm": 0.26953125, + "learning_rate": 0.000163345502312003, + "loss": 1.7343, + "step": 3611 + }, + { + "epoch": 1.1178493279149735, + "grad_norm": 0.265625, + "learning_rate": 0.00016332648819897968, + "loss": 1.8112, + "step": 3612 + }, + { + "epoch": 1.1181619256017505, + "grad_norm": 0.265625, + "learning_rate": 0.00016330747026285563, + "loss": 1.6694, + "step": 3613 + }, + { + "epoch": 1.1184745232885276, + "grad_norm": 0.259765625, + "learning_rate": 0.0001632884485047789, + "loss": 1.6057, + "step": 3614 + }, + { + "epoch": 1.1187871209753049, + "grad_norm": 0.271484375, + "learning_rate": 0.00016326942292589785, + "loss": 1.2595, + "step": 3615 + }, + { + "epoch": 1.119099718662082, + "grad_norm": 0.279296875, + "learning_rate": 0.00016325039352736113, + "loss": 1.6727, + "step": 3616 + }, + { + "epoch": 1.119412316348859, + "grad_norm": 0.2890625, + "learning_rate": 0.0001632313603103176, + "loss": 1.6012, + "step": 3617 + }, + { + "epoch": 1.1197249140356362, + "grad_norm": 0.2578125, + "learning_rate": 0.00016321232327591622, + "loss": 1.5811, + "step": 3618 + }, + { + "epoch": 1.1200375117224133, + "grad_norm": 0.26953125, + "learning_rate": 0.00016319328242530635, + "loss": 1.5852, + "step": 3619 + }, + { + "epoch": 1.1203501094091903, + "grad_norm": 0.265625, + "learning_rate": 0.00016317423775963748, + "loss": 1.9542, + "step": 3620 + }, + { + "epoch": 1.1206627070959674, + "grad_norm": 0.283203125, + "learning_rate": 0.0001631551892800594, + "loss": 1.5583, + "step": 3621 + }, + { + "epoch": 1.1209753047827447, + "grad_norm": 0.2578125, + "learning_rate": 0.000163136136987722, + "loss": 1.5256, + "step": 3622 + }, + { + "epoch": 1.1212879024695217, + "grad_norm": 0.263671875, + "learning_rate": 0.00016311708088377562, + "loss": 1.6357, + "step": 3623 + }, + { + "epoch": 1.1216005001562988, + "grad_norm": 0.28125, + "learning_rate": 0.0001630980209693706, + "loss": 1.3957, + "step": 3624 + }, + { + "epoch": 1.121913097843076, + "grad_norm": 0.26171875, + "learning_rate": 0.0001630789572456577, + "loss": 1.6948, + "step": 3625 + }, + { + "epoch": 1.122225695529853, + "grad_norm": 0.271484375, + "learning_rate": 0.0001630598897137877, + "loss": 1.5104, + "step": 3626 + }, + { + "epoch": 1.1225382932166301, + "grad_norm": 0.25390625, + "learning_rate": 0.00016304081837491185, + "loss": 1.6538, + "step": 3627 + }, + { + "epoch": 1.1228508909034074, + "grad_norm": 0.2734375, + "learning_rate": 0.00016302174323018146, + "loss": 1.495, + "step": 3628 + }, + { + "epoch": 1.1231634885901844, + "grad_norm": 0.26171875, + "learning_rate": 0.0001630026642807481, + "loss": 1.4547, + "step": 3629 + }, + { + "epoch": 1.1234760862769615, + "grad_norm": 0.267578125, + "learning_rate": 0.00016298358152776361, + "loss": 1.6914, + "step": 3630 + }, + { + "epoch": 1.1237886839637388, + "grad_norm": 0.259765625, + "learning_rate": 0.00016296449497238004, + "loss": 1.7995, + "step": 3631 + }, + { + "epoch": 1.1241012816505158, + "grad_norm": 0.259765625, + "learning_rate": 0.00016294540461574968, + "loss": 1.3836, + "step": 3632 + }, + { + "epoch": 1.1244138793372929, + "grad_norm": 0.255859375, + "learning_rate": 0.00016292631045902506, + "loss": 1.4697, + "step": 3633 + }, + { + "epoch": 1.12472647702407, + "grad_norm": 0.259765625, + "learning_rate": 0.00016290721250335883, + "loss": 1.5252, + "step": 3634 + }, + { + "epoch": 
1.1250390747108472, + "grad_norm": 0.26171875, + "learning_rate": 0.00016288811074990407, + "loss": 1.6655, + "step": 3635 + }, + { + "epoch": 1.1253516723976242, + "grad_norm": 0.265625, + "learning_rate": 0.0001628690051998139, + "loss": 1.4658, + "step": 3636 + }, + { + "epoch": 1.1256642700844013, + "grad_norm": 0.26171875, + "learning_rate": 0.0001628498958542418, + "loss": 1.6769, + "step": 3637 + }, + { + "epoch": 1.1259768677711786, + "grad_norm": 0.263671875, + "learning_rate": 0.00016283078271434135, + "loss": 1.71, + "step": 3638 + }, + { + "epoch": 1.1262894654579556, + "grad_norm": 0.26171875, + "learning_rate": 0.00016281166578126653, + "loss": 1.4462, + "step": 3639 + }, + { + "epoch": 1.1266020631447327, + "grad_norm": 0.294921875, + "learning_rate": 0.00016279254505617138, + "loss": 2.2732, + "step": 3640 + }, + { + "epoch": 1.12691466083151, + "grad_norm": 0.2578125, + "learning_rate": 0.00016277342054021022, + "loss": 1.4325, + "step": 3641 + }, + { + "epoch": 1.127227258518287, + "grad_norm": 0.28125, + "learning_rate": 0.00016275429223453776, + "loss": 1.6099, + "step": 3642 + }, + { + "epoch": 1.127539856205064, + "grad_norm": 0.271484375, + "learning_rate": 0.00016273516014030865, + "loss": 1.7282, + "step": 3643 + }, + { + "epoch": 1.1278524538918413, + "grad_norm": 0.265625, + "learning_rate": 0.000162716024258678, + "loss": 1.5569, + "step": 3644 + }, + { + "epoch": 1.1281650515786183, + "grad_norm": 0.25, + "learning_rate": 0.00016269688459080104, + "loss": 1.4227, + "step": 3645 + }, + { + "epoch": 1.1284776492653954, + "grad_norm": 0.275390625, + "learning_rate": 0.00016267774113783325, + "loss": 1.5628, + "step": 3646 + }, + { + "epoch": 1.1287902469521724, + "grad_norm": 0.259765625, + "learning_rate": 0.00016265859390093037, + "loss": 1.9694, + "step": 3647 + }, + { + "epoch": 1.1291028446389497, + "grad_norm": 0.275390625, + "learning_rate": 0.00016263944288124832, + "loss": 1.3973, + "step": 3648 + }, + { + "epoch": 1.1294154423257268, + "grad_norm": 0.244140625, + "learning_rate": 0.00016262028807994332, + "loss": 1.5886, + "step": 3649 + }, + { + "epoch": 1.1297280400125038, + "grad_norm": 0.26953125, + "learning_rate": 0.0001626011294981717, + "loss": 1.8958, + "step": 3650 + }, + { + "epoch": 1.130040637699281, + "grad_norm": 0.2490234375, + "learning_rate": 0.0001625819671370901, + "loss": 1.5393, + "step": 3651 + }, + { + "epoch": 1.1303532353860581, + "grad_norm": 0.259765625, + "learning_rate": 0.0001625628009978554, + "loss": 1.4789, + "step": 3652 + }, + { + "epoch": 1.1306658330728352, + "grad_norm": 0.251953125, + "learning_rate": 0.00016254363108162472, + "loss": 1.5094, + "step": 3653 + }, + { + "epoch": 1.1309784307596125, + "grad_norm": 0.263671875, + "learning_rate": 0.00016252445738955529, + "loss": 1.6394, + "step": 3654 + }, + { + "epoch": 1.1312910284463895, + "grad_norm": 0.251953125, + "learning_rate": 0.0001625052799228047, + "loss": 1.4005, + "step": 3655 + }, + { + "epoch": 1.1316036261331666, + "grad_norm": 0.26171875, + "learning_rate": 0.00016248609868253072, + "loss": 1.3923, + "step": 3656 + }, + { + "epoch": 1.1319162238199438, + "grad_norm": 0.25390625, + "learning_rate": 0.00016246691366989132, + "loss": 1.4759, + "step": 3657 + }, + { + "epoch": 1.1322288215067209, + "grad_norm": 0.265625, + "learning_rate": 0.00016244772488604477, + "loss": 1.4589, + "step": 3658 + }, + { + "epoch": 1.132541419193498, + "grad_norm": 0.265625, + "learning_rate": 0.00016242853233214944, + "loss": 1.6923, + "step": 3659 + }, + { + "epoch": 
1.132854016880275, + "grad_norm": 0.263671875, + "learning_rate": 0.00016240933600936413, + "loss": 1.5223, + "step": 3660 + }, + { + "epoch": 1.1331666145670523, + "grad_norm": 0.251953125, + "learning_rate": 0.00016239013591884765, + "loss": 1.8563, + "step": 3661 + }, + { + "epoch": 1.1334792122538293, + "grad_norm": 0.26171875, + "learning_rate": 0.0001623709320617591, + "loss": 1.7337, + "step": 3662 + }, + { + "epoch": 1.1337918099406064, + "grad_norm": 0.26171875, + "learning_rate": 0.00016235172443925796, + "loss": 1.313, + "step": 3663 + }, + { + "epoch": 1.1341044076273836, + "grad_norm": 0.265625, + "learning_rate": 0.00016233251305250375, + "loss": 1.8978, + "step": 3664 + }, + { + "epoch": 1.1344170053141607, + "grad_norm": 0.26171875, + "learning_rate": 0.0001623132979026563, + "loss": 1.7873, + "step": 3665 + }, + { + "epoch": 1.1347296030009377, + "grad_norm": 0.259765625, + "learning_rate": 0.00016229407899087566, + "loss": 1.6152, + "step": 3666 + }, + { + "epoch": 1.135042200687715, + "grad_norm": 0.25, + "learning_rate": 0.00016227485631832206, + "loss": 1.3307, + "step": 3667 + }, + { + "epoch": 1.135354798374492, + "grad_norm": 0.275390625, + "learning_rate": 0.00016225562988615605, + "loss": 1.5199, + "step": 3668 + }, + { + "epoch": 1.135667396061269, + "grad_norm": 0.25390625, + "learning_rate": 0.0001622363996955383, + "loss": 1.5746, + "step": 3669 + }, + { + "epoch": 1.1359799937480464, + "grad_norm": 0.267578125, + "learning_rate": 0.00016221716574762982, + "loss": 1.734, + "step": 3670 + }, + { + "epoch": 1.1362925914348234, + "grad_norm": 0.2734375, + "learning_rate": 0.00016219792804359173, + "loss": 1.5776, + "step": 3671 + }, + { + "epoch": 1.1366051891216005, + "grad_norm": 0.267578125, + "learning_rate": 0.00016217868658458554, + "loss": 1.3582, + "step": 3672 + }, + { + "epoch": 1.1369177868083775, + "grad_norm": 0.248046875, + "learning_rate": 0.00016215944137177273, + "loss": 1.5522, + "step": 3673 + }, + { + "epoch": 1.1372303844951548, + "grad_norm": 0.26171875, + "learning_rate": 0.00016214019240631523, + "loss": 1.5427, + "step": 3674 + }, + { + "epoch": 1.1375429821819318, + "grad_norm": 0.26171875, + "learning_rate": 0.00016212093968937517, + "loss": 1.5371, + "step": 3675 + }, + { + "epoch": 1.1378555798687089, + "grad_norm": 0.25390625, + "learning_rate": 0.0001621016832221148, + "loss": 1.5362, + "step": 3676 + }, + { + "epoch": 1.1381681775554862, + "grad_norm": 0.259765625, + "learning_rate": 0.00016208242300569668, + "loss": 1.5966, + "step": 3677 + }, + { + "epoch": 1.1384807752422632, + "grad_norm": 0.267578125, + "learning_rate": 0.00016206315904128358, + "loss": 1.5752, + "step": 3678 + }, + { + "epoch": 1.1387933729290403, + "grad_norm": 0.26171875, + "learning_rate": 0.00016204389133003848, + "loss": 1.6001, + "step": 3679 + }, + { + "epoch": 1.1391059706158175, + "grad_norm": 0.26171875, + "learning_rate": 0.00016202461987312457, + "loss": 1.7705, + "step": 3680 + }, + { + "epoch": 1.1394185683025946, + "grad_norm": 0.26171875, + "learning_rate": 0.00016200534467170533, + "loss": 1.9231, + "step": 3681 + }, + { + "epoch": 1.1397311659893716, + "grad_norm": 0.255859375, + "learning_rate": 0.00016198606572694443, + "loss": 1.4175, + "step": 3682 + }, + { + "epoch": 1.140043763676149, + "grad_norm": 0.267578125, + "learning_rate": 0.00016196678304000573, + "loss": 1.6075, + "step": 3683 + }, + { + "epoch": 1.140356361362926, + "grad_norm": 0.25390625, + "learning_rate": 0.00016194749661205341, + "loss": 1.3712, + "step": 3684 + }, + { 
+ "epoch": 1.140668959049703, + "grad_norm": 0.275390625, + "learning_rate": 0.00016192820644425176, + "loss": 1.6146, + "step": 3685 + }, + { + "epoch": 1.14098155673648, + "grad_norm": 0.267578125, + "learning_rate": 0.0001619089125377654, + "loss": 1.5327, + "step": 3686 + }, + { + "epoch": 1.1412941544232573, + "grad_norm": 0.26953125, + "learning_rate": 0.00016188961489375903, + "loss": 1.4308, + "step": 3687 + }, + { + "epoch": 1.1416067521100344, + "grad_norm": 0.26171875, + "learning_rate": 0.0001618703135133978, + "loss": 1.693, + "step": 3688 + }, + { + "epoch": 1.1419193497968114, + "grad_norm": 0.259765625, + "learning_rate": 0.0001618510083978469, + "loss": 1.2902, + "step": 3689 + }, + { + "epoch": 1.1422319474835887, + "grad_norm": 0.267578125, + "learning_rate": 0.00016183169954827177, + "loss": 1.4811, + "step": 3690 + }, + { + "epoch": 1.1425445451703657, + "grad_norm": 0.26953125, + "learning_rate": 0.00016181238696583815, + "loss": 1.4203, + "step": 3691 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.28515625, + "learning_rate": 0.000161793070651712, + "loss": 1.515, + "step": 3692 + }, + { + "epoch": 1.14316974054392, + "grad_norm": 0.26953125, + "learning_rate": 0.0001617737506070594, + "loss": 1.4142, + "step": 3693 + }, + { + "epoch": 1.1434823382306971, + "grad_norm": 0.25, + "learning_rate": 0.00016175442683304673, + "loss": 1.3988, + "step": 3694 + }, + { + "epoch": 1.1437949359174742, + "grad_norm": 0.263671875, + "learning_rate": 0.00016173509933084068, + "loss": 1.5393, + "step": 3695 + }, + { + "epoch": 1.1441075336042514, + "grad_norm": 0.271484375, + "learning_rate": 0.00016171576810160797, + "loss": 1.6045, + "step": 3696 + }, + { + "epoch": 1.1444201312910285, + "grad_norm": 0.279296875, + "learning_rate": 0.00016169643314651572, + "loss": 1.5186, + "step": 3697 + }, + { + "epoch": 1.1447327289778055, + "grad_norm": 0.2578125, + "learning_rate": 0.0001616770944667312, + "loss": 1.6072, + "step": 3698 + }, + { + "epoch": 1.1450453266645826, + "grad_norm": 0.2734375, + "learning_rate": 0.00016165775206342185, + "loss": 1.6905, + "step": 3699 + }, + { + "epoch": 1.1453579243513599, + "grad_norm": 0.267578125, + "learning_rate": 0.00016163840593775541, + "loss": 1.5007, + "step": 3700 + }, + { + "epoch": 1.145670522038137, + "grad_norm": 0.265625, + "learning_rate": 0.0001616190560908999, + "loss": 1.6387, + "step": 3701 + }, + { + "epoch": 1.145983119724914, + "grad_norm": 0.265625, + "learning_rate": 0.00016159970252402345, + "loss": 1.347, + "step": 3702 + }, + { + "epoch": 1.1462957174116912, + "grad_norm": 0.2578125, + "learning_rate": 0.00016158034523829445, + "loss": 1.5434, + "step": 3703 + }, + { + "epoch": 1.1466083150984683, + "grad_norm": 0.28515625, + "learning_rate": 0.00016156098423488155, + "loss": 1.5995, + "step": 3704 + }, + { + "epoch": 1.1469209127852453, + "grad_norm": 0.25390625, + "learning_rate": 0.0001615416195149536, + "loss": 1.7075, + "step": 3705 + }, + { + "epoch": 1.1472335104720226, + "grad_norm": 0.275390625, + "learning_rate": 0.00016152225107967963, + "loss": 1.6339, + "step": 3706 + }, + { + "epoch": 1.1475461081587996, + "grad_norm": 0.2734375, + "learning_rate": 0.00016150287893022894, + "loss": 1.6718, + "step": 3707 + }, + { + "epoch": 1.1478587058455767, + "grad_norm": 0.255859375, + "learning_rate": 0.00016148350306777111, + "loss": 1.7101, + "step": 3708 + }, + { + "epoch": 1.148171303532354, + "grad_norm": 0.259765625, + "learning_rate": 0.00016146412349347583, + "loss": 1.8711, + "step": 3709 + }, + { + 
"epoch": 1.148483901219131, + "grad_norm": 0.259765625, + "learning_rate": 0.00016144474020851312, + "loss": 1.4087, + "step": 3710 + }, + { + "epoch": 1.148796498905908, + "grad_norm": 0.265625, + "learning_rate": 0.00016142535321405312, + "loss": 1.2498, + "step": 3711 + }, + { + "epoch": 1.1491090965926851, + "grad_norm": 0.2578125, + "learning_rate": 0.00016140596251126626, + "loss": 1.6021, + "step": 3712 + }, + { + "epoch": 1.1494216942794624, + "grad_norm": 0.2578125, + "learning_rate": 0.00016138656810132322, + "loss": 1.5073, + "step": 3713 + }, + { + "epoch": 1.1497342919662394, + "grad_norm": 0.251953125, + "learning_rate": 0.00016136716998539483, + "loss": 1.4993, + "step": 3714 + }, + { + "epoch": 1.1500468896530165, + "grad_norm": 0.26171875, + "learning_rate": 0.0001613477681646522, + "loss": 1.5221, + "step": 3715 + }, + { + "epoch": 1.1503594873397938, + "grad_norm": 0.259765625, + "learning_rate": 0.0001613283626402666, + "loss": 1.3269, + "step": 3716 + }, + { + "epoch": 1.1506720850265708, + "grad_norm": 0.259765625, + "learning_rate": 0.00016130895341340962, + "loss": 1.6539, + "step": 3717 + }, + { + "epoch": 1.1509846827133479, + "grad_norm": 0.279296875, + "learning_rate": 0.00016128954048525297, + "loss": 1.6418, + "step": 3718 + }, + { + "epoch": 1.1512972804001251, + "grad_norm": 0.26171875, + "learning_rate": 0.0001612701238569687, + "loss": 1.4092, + "step": 3719 + }, + { + "epoch": 1.1516098780869022, + "grad_norm": 0.2578125, + "learning_rate": 0.00016125070352972896, + "loss": 1.4823, + "step": 3720 + }, + { + "epoch": 1.1519224757736792, + "grad_norm": 0.275390625, + "learning_rate": 0.00016123127950470618, + "loss": 1.5155, + "step": 3721 + }, + { + "epoch": 1.1522350734604565, + "grad_norm": 0.267578125, + "learning_rate": 0.000161211851783073, + "loss": 1.4642, + "step": 3722 + }, + { + "epoch": 1.1525476711472336, + "grad_norm": 0.275390625, + "learning_rate": 0.00016119242036600237, + "loss": 1.8195, + "step": 3723 + }, + { + "epoch": 1.1528602688340106, + "grad_norm": 0.259765625, + "learning_rate": 0.00016117298525466733, + "loss": 1.3579, + "step": 3724 + }, + { + "epoch": 1.1531728665207877, + "grad_norm": 0.28515625, + "learning_rate": 0.00016115354645024126, + "loss": 1.6316, + "step": 3725 + }, + { + "epoch": 1.153485464207565, + "grad_norm": 0.26953125, + "learning_rate": 0.0001611341039538976, + "loss": 1.7061, + "step": 3726 + }, + { + "epoch": 1.153798061894342, + "grad_norm": 0.25390625, + "learning_rate": 0.00016111465776681022, + "loss": 1.5465, + "step": 3727 + }, + { + "epoch": 1.154110659581119, + "grad_norm": 0.2578125, + "learning_rate": 0.00016109520789015305, + "loss": 1.4982, + "step": 3728 + }, + { + "epoch": 1.1544232572678963, + "grad_norm": 0.267578125, + "learning_rate": 0.0001610757543251003, + "loss": 1.7491, + "step": 3729 + }, + { + "epoch": 1.1547358549546733, + "grad_norm": 0.26953125, + "learning_rate": 0.0001610562970728265, + "loss": 1.6059, + "step": 3730 + }, + { + "epoch": 1.1550484526414504, + "grad_norm": 0.2734375, + "learning_rate": 0.00016103683613450618, + "loss": 1.3079, + "step": 3731 + }, + { + "epoch": 1.1553610503282277, + "grad_norm": 0.263671875, + "learning_rate": 0.0001610173715113143, + "loss": 1.5474, + "step": 3732 + }, + { + "epoch": 1.1556736480150047, + "grad_norm": 0.2734375, + "learning_rate": 0.00016099790320442593, + "loss": 1.6536, + "step": 3733 + }, + { + "epoch": 1.1559862457017818, + "grad_norm": 0.265625, + "learning_rate": 0.00016097843121501646, + "loss": 1.6882, + "step": 3734 + 
}, + { + "epoch": 1.156298843388559, + "grad_norm": 0.26953125, + "learning_rate": 0.00016095895554426134, + "loss": 1.4942, + "step": 3735 + }, + { + "epoch": 1.156611441075336, + "grad_norm": 0.25390625, + "learning_rate": 0.00016093947619333644, + "loss": 1.6029, + "step": 3736 + }, + { + "epoch": 1.1569240387621131, + "grad_norm": 0.28125, + "learning_rate": 0.00016091999316341767, + "loss": 1.5016, + "step": 3737 + }, + { + "epoch": 1.1572366364488902, + "grad_norm": 0.24609375, + "learning_rate": 0.0001609005064556813, + "loss": 1.8828, + "step": 3738 + }, + { + "epoch": 1.1575492341356675, + "grad_norm": 0.25, + "learning_rate": 0.00016088101607130377, + "loss": 1.2023, + "step": 3739 + }, + { + "epoch": 1.1578618318224445, + "grad_norm": 0.28125, + "learning_rate": 0.00016086152201146166, + "loss": 1.6386, + "step": 3740 + }, + { + "epoch": 1.1581744295092216, + "grad_norm": 0.271484375, + "learning_rate": 0.00016084202427733198, + "loss": 1.413, + "step": 3741 + }, + { + "epoch": 1.1584870271959988, + "grad_norm": 0.24609375, + "learning_rate": 0.00016082252287009173, + "loss": 1.6755, + "step": 3742 + }, + { + "epoch": 1.1587996248827759, + "grad_norm": 0.27734375, + "learning_rate": 0.00016080301779091826, + "loss": 1.8094, + "step": 3743 + }, + { + "epoch": 1.159112222569553, + "grad_norm": 0.27734375, + "learning_rate": 0.00016078350904098914, + "loss": 1.5788, + "step": 3744 + }, + { + "epoch": 1.15942482025633, + "grad_norm": 0.25, + "learning_rate": 0.00016076399662148208, + "loss": 1.2638, + "step": 3745 + }, + { + "epoch": 1.1597374179431073, + "grad_norm": 0.25390625, + "learning_rate": 0.00016074448053357516, + "loss": 1.6745, + "step": 3746 + }, + { + "epoch": 1.1600500156298843, + "grad_norm": 0.259765625, + "learning_rate": 0.0001607249607784465, + "loss": 1.5774, + "step": 3747 + }, + { + "epoch": 1.1603626133166616, + "grad_norm": 0.271484375, + "learning_rate": 0.00016070543735727464, + "loss": 1.7768, + "step": 3748 + }, + { + "epoch": 1.1606752110034386, + "grad_norm": 0.25, + "learning_rate": 0.00016068591027123812, + "loss": 1.4988, + "step": 3749 + }, + { + "epoch": 1.1609878086902157, + "grad_norm": 0.271484375, + "learning_rate": 0.00016066637952151587, + "loss": 1.7819, + "step": 3750 + }, + { + "epoch": 1.1613004063769927, + "grad_norm": 0.25, + "learning_rate": 0.000160646845109287, + "loss": 1.7408, + "step": 3751 + }, + { + "epoch": 1.16161300406377, + "grad_norm": 0.267578125, + "learning_rate": 0.00016062730703573076, + "loss": 1.6412, + "step": 3752 + }, + { + "epoch": 1.161925601750547, + "grad_norm": 0.27734375, + "learning_rate": 0.00016060776530202678, + "loss": 1.6517, + "step": 3753 + }, + { + "epoch": 1.162238199437324, + "grad_norm": 0.275390625, + "learning_rate": 0.00016058821990935475, + "loss": 1.4441, + "step": 3754 + }, + { + "epoch": 1.1625507971241014, + "grad_norm": 0.259765625, + "learning_rate": 0.0001605686708588947, + "loss": 1.4506, + "step": 3755 + }, + { + "epoch": 1.1628633948108784, + "grad_norm": 0.26171875, + "learning_rate": 0.0001605491181518268, + "loss": 1.5241, + "step": 3756 + }, + { + "epoch": 1.1631759924976555, + "grad_norm": 0.27734375, + "learning_rate": 0.00016052956178933147, + "loss": 1.722, + "step": 3757 + }, + { + "epoch": 1.1634885901844325, + "grad_norm": 0.25390625, + "learning_rate": 0.00016051000177258934, + "loss": 1.6285, + "step": 3758 + }, + { + "epoch": 1.1638011878712098, + "grad_norm": 0.337890625, + "learning_rate": 0.00016049043810278132, + "loss": 2.0735, + "step": 3759 + }, + { + "epoch": 
1.1641137855579868, + "grad_norm": 0.265625, + "learning_rate": 0.0001604708707810885, + "loss": 1.5888, + "step": 3760 + }, + { + "epoch": 1.164426383244764, + "grad_norm": 0.28515625, + "learning_rate": 0.0001604512998086921, + "loss": 1.3625, + "step": 3761 + }, + { + "epoch": 1.1647389809315412, + "grad_norm": 0.2734375, + "learning_rate": 0.00016043172518677372, + "loss": 1.5767, + "step": 3762 + }, + { + "epoch": 1.1650515786183182, + "grad_norm": 0.26171875, + "learning_rate": 0.00016041214691651508, + "loss": 1.6241, + "step": 3763 + }, + { + "epoch": 1.1653641763050953, + "grad_norm": 0.271484375, + "learning_rate": 0.00016039256499909813, + "loss": 1.7103, + "step": 3764 + }, + { + "epoch": 1.1656767739918725, + "grad_norm": 0.259765625, + "learning_rate": 0.00016037297943570508, + "loss": 1.7265, + "step": 3765 + }, + { + "epoch": 1.1659893716786496, + "grad_norm": 0.263671875, + "learning_rate": 0.00016035339022751836, + "loss": 1.6959, + "step": 3766 + }, + { + "epoch": 1.1663019693654266, + "grad_norm": 0.26171875, + "learning_rate": 0.00016033379737572054, + "loss": 1.408, + "step": 3767 + }, + { + "epoch": 1.166614567052204, + "grad_norm": 0.26171875, + "learning_rate": 0.0001603142008814945, + "loss": 1.4535, + "step": 3768 + }, + { + "epoch": 1.166927164738981, + "grad_norm": 0.259765625, + "learning_rate": 0.00016029460074602325, + "loss": 1.5532, + "step": 3769 + }, + { + "epoch": 1.167239762425758, + "grad_norm": 0.2734375, + "learning_rate": 0.00016027499697049015, + "loss": 1.5624, + "step": 3770 + }, + { + "epoch": 1.167552360112535, + "grad_norm": 0.2734375, + "learning_rate": 0.00016025538955607865, + "loss": 1.7583, + "step": 3771 + }, + { + "epoch": 1.1678649577993123, + "grad_norm": 0.267578125, + "learning_rate": 0.00016023577850397252, + "loss": 1.6003, + "step": 3772 + }, + { + "epoch": 1.1681775554860894, + "grad_norm": 0.267578125, + "learning_rate": 0.0001602161638153557, + "loss": 1.528, + "step": 3773 + }, + { + "epoch": 1.1684901531728666, + "grad_norm": 0.271484375, + "learning_rate": 0.00016019654549141233, + "loss": 1.4343, + "step": 3774 + }, + { + "epoch": 1.1688027508596437, + "grad_norm": 0.271484375, + "learning_rate": 0.0001601769235333268, + "loss": 1.38, + "step": 3775 + }, + { + "epoch": 1.1691153485464207, + "grad_norm": 0.26171875, + "learning_rate": 0.00016015729794228366, + "loss": 1.6298, + "step": 3776 + }, + { + "epoch": 1.1694279462331978, + "grad_norm": 0.26171875, + "learning_rate": 0.00016013766871946785, + "loss": 1.5653, + "step": 3777 + }, + { + "epoch": 1.169740543919975, + "grad_norm": 0.259765625, + "learning_rate": 0.0001601180358660643, + "loss": 1.3986, + "step": 3778 + }, + { + "epoch": 1.1700531416067521, + "grad_norm": 0.267578125, + "learning_rate": 0.00016009839938325836, + "loss": 1.5358, + "step": 3779 + }, + { + "epoch": 1.1703657392935292, + "grad_norm": 0.26171875, + "learning_rate": 0.00016007875927223544, + "loss": 1.7622, + "step": 3780 + }, + { + "epoch": 1.1706783369803064, + "grad_norm": 0.265625, + "learning_rate": 0.00016005911553418126, + "loss": 1.4816, + "step": 3781 + }, + { + "epoch": 1.1709909346670835, + "grad_norm": 0.265625, + "learning_rate": 0.00016003946817028173, + "loss": 1.5785, + "step": 3782 + }, + { + "epoch": 1.1713035323538605, + "grad_norm": 0.27734375, + "learning_rate": 0.00016001981718172302, + "loss": 1.6437, + "step": 3783 + }, + { + "epoch": 1.1716161300406376, + "grad_norm": 0.263671875, + "learning_rate": 0.00016000016256969145, + "loss": 1.5012, + "step": 3784 + }, + { + 
"epoch": 1.1719287277274149, + "grad_norm": 0.265625, + "learning_rate": 0.00015998050433537362, + "loss": 1.8112, + "step": 3785 + }, + { + "epoch": 1.172241325414192, + "grad_norm": 0.2734375, + "learning_rate": 0.0001599608424799563, + "loss": 1.7065, + "step": 3786 + }, + { + "epoch": 1.1725539231009692, + "grad_norm": 0.287109375, + "learning_rate": 0.00015994117700462648, + "loss": 1.6396, + "step": 3787 + }, + { + "epoch": 1.1728665207877462, + "grad_norm": 0.26953125, + "learning_rate": 0.00015992150791057147, + "loss": 1.5667, + "step": 3788 + }, + { + "epoch": 1.1731791184745233, + "grad_norm": 0.2734375, + "learning_rate": 0.00015990183519897866, + "loss": 1.5552, + "step": 3789 + }, + { + "epoch": 1.1734917161613003, + "grad_norm": 0.251953125, + "learning_rate": 0.0001598821588710357, + "loss": 1.3596, + "step": 3790 + }, + { + "epoch": 1.1738043138480776, + "grad_norm": 0.267578125, + "learning_rate": 0.00015986247892793053, + "loss": 1.6007, + "step": 3791 + }, + { + "epoch": 1.1741169115348546, + "grad_norm": 0.26171875, + "learning_rate": 0.0001598427953708512, + "loss": 1.6813, + "step": 3792 + }, + { + "epoch": 1.1744295092216317, + "grad_norm": 0.251953125, + "learning_rate": 0.00015982310820098608, + "loss": 1.6009, + "step": 3793 + }, + { + "epoch": 1.174742106908409, + "grad_norm": 0.248046875, + "learning_rate": 0.00015980341741952367, + "loss": 1.4627, + "step": 3794 + }, + { + "epoch": 1.175054704595186, + "grad_norm": 0.263671875, + "learning_rate": 0.00015978372302765273, + "loss": 1.5302, + "step": 3795 + }, + { + "epoch": 1.175367302281963, + "grad_norm": 0.2734375, + "learning_rate": 0.00015976402502656227, + "loss": 1.6636, + "step": 3796 + }, + { + "epoch": 1.1756798999687401, + "grad_norm": 0.255859375, + "learning_rate": 0.00015974432341744142, + "loss": 1.4919, + "step": 3797 + }, + { + "epoch": 1.1759924976555174, + "grad_norm": 0.26953125, + "learning_rate": 0.00015972461820147968, + "loss": 1.5591, + "step": 3798 + }, + { + "epoch": 1.1763050953422944, + "grad_norm": 0.302734375, + "learning_rate": 0.00015970490937986662, + "loss": 2.0899, + "step": 3799 + }, + { + "epoch": 1.1766176930290717, + "grad_norm": 0.26171875, + "learning_rate": 0.0001596851969537921, + "loss": 1.6985, + "step": 3800 + }, + { + "epoch": 1.1769302907158488, + "grad_norm": 0.26171875, + "learning_rate": 0.00015966548092444618, + "loss": 1.448, + "step": 3801 + }, + { + "epoch": 1.1772428884026258, + "grad_norm": 0.287109375, + "learning_rate": 0.0001596457612930191, + "loss": 1.5943, + "step": 3802 + }, + { + "epoch": 1.1775554860894029, + "grad_norm": 0.255859375, + "learning_rate": 0.00015962603806070146, + "loss": 1.6764, + "step": 3803 + }, + { + "epoch": 1.1778680837761801, + "grad_norm": 0.263671875, + "learning_rate": 0.0001596063112286839, + "loss": 1.6969, + "step": 3804 + }, + { + "epoch": 1.1781806814629572, + "grad_norm": 0.271484375, + "learning_rate": 0.00015958658079815737, + "loss": 1.4794, + "step": 3805 + }, + { + "epoch": 1.1784932791497342, + "grad_norm": 0.255859375, + "learning_rate": 0.00015956684677031303, + "loss": 1.6924, + "step": 3806 + }, + { + "epoch": 1.1788058768365115, + "grad_norm": 0.2578125, + "learning_rate": 0.00015954710914634226, + "loss": 1.487, + "step": 3807 + }, + { + "epoch": 1.1791184745232886, + "grad_norm": 0.26171875, + "learning_rate": 0.0001595273679274366, + "loss": 1.7106, + "step": 3808 + }, + { + "epoch": 1.1794310722100656, + "grad_norm": 0.265625, + "learning_rate": 0.0001595076231147879, + "loss": 1.5495, + "step": 3809 
+ }, + { + "epoch": 1.1797436698968427, + "grad_norm": 0.2578125, + "learning_rate": 0.00015948787470958817, + "loss": 1.7602, + "step": 3810 + }, + { + "epoch": 1.18005626758362, + "grad_norm": 0.328125, + "learning_rate": 0.0001594681227130296, + "loss": 2.4393, + "step": 3811 + }, + { + "epoch": 1.180368865270397, + "grad_norm": 0.267578125, + "learning_rate": 0.00015944836712630472, + "loss": 1.4862, + "step": 3812 + }, + { + "epoch": 1.180681462957174, + "grad_norm": 0.271484375, + "learning_rate": 0.00015942860795060618, + "loss": 1.5807, + "step": 3813 + }, + { + "epoch": 1.1809940606439513, + "grad_norm": 0.267578125, + "learning_rate": 0.00015940884518712676, + "loss": 1.6587, + "step": 3814 + }, + { + "epoch": 1.1813066583307283, + "grad_norm": 0.259765625, + "learning_rate": 0.00015938907883705973, + "loss": 1.6634, + "step": 3815 + }, + { + "epoch": 1.1816192560175054, + "grad_norm": 0.263671875, + "learning_rate": 0.0001593693089015983, + "loss": 1.4222, + "step": 3816 + }, + { + "epoch": 1.1819318537042827, + "grad_norm": 0.271484375, + "learning_rate": 0.00015934953538193603, + "loss": 1.5701, + "step": 3817 + }, + { + "epoch": 1.1822444513910597, + "grad_norm": 0.259765625, + "learning_rate": 0.0001593297582792667, + "loss": 1.5497, + "step": 3818 + }, + { + "epoch": 1.1825570490778368, + "grad_norm": 0.25390625, + "learning_rate": 0.00015930997759478426, + "loss": 1.4805, + "step": 3819 + }, + { + "epoch": 1.182869646764614, + "grad_norm": 0.265625, + "learning_rate": 0.00015929019332968286, + "loss": 1.6392, + "step": 3820 + }, + { + "epoch": 1.183182244451391, + "grad_norm": 0.287109375, + "learning_rate": 0.00015927040548515696, + "loss": 1.6403, + "step": 3821 + }, + { + "epoch": 1.1834948421381681, + "grad_norm": 0.259765625, + "learning_rate": 0.00015925061406240116, + "loss": 1.5835, + "step": 3822 + }, + { + "epoch": 1.1838074398249452, + "grad_norm": 0.287109375, + "learning_rate": 0.00015923081906261025, + "loss": 1.5995, + "step": 3823 + }, + { + "epoch": 1.1841200375117225, + "grad_norm": 0.26171875, + "learning_rate": 0.00015921102048697936, + "loss": 1.7164, + "step": 3824 + }, + { + "epoch": 1.1844326351984995, + "grad_norm": 0.271484375, + "learning_rate": 0.00015919121833670368, + "loss": 1.3208, + "step": 3825 + }, + { + "epoch": 1.1847452328852766, + "grad_norm": 0.26171875, + "learning_rate": 0.00015917141261297875, + "loss": 1.6998, + "step": 3826 + }, + { + "epoch": 1.1850578305720538, + "grad_norm": 0.259765625, + "learning_rate": 0.0001591516033170002, + "loss": 1.7813, + "step": 3827 + }, + { + "epoch": 1.1853704282588309, + "grad_norm": 0.2470703125, + "learning_rate": 0.000159131790449964, + "loss": 1.249, + "step": 3828 + }, + { + "epoch": 1.185683025945608, + "grad_norm": 0.259765625, + "learning_rate": 0.00015911197401306625, + "loss": 1.2939, + "step": 3829 + }, + { + "epoch": 1.1859956236323852, + "grad_norm": 0.251953125, + "learning_rate": 0.0001590921540075033, + "loss": 1.6855, + "step": 3830 + }, + { + "epoch": 1.1863082213191622, + "grad_norm": 0.26953125, + "learning_rate": 0.00015907233043447173, + "loss": 1.8501, + "step": 3831 + }, + { + "epoch": 1.1866208190059393, + "grad_norm": 0.263671875, + "learning_rate": 0.00015905250329516829, + "loss": 1.496, + "step": 3832 + }, + { + "epoch": 1.1869334166927166, + "grad_norm": 0.25390625, + "learning_rate": 0.00015903267259078995, + "loss": 1.6494, + "step": 3833 + }, + { + "epoch": 1.1872460143794936, + "grad_norm": 0.283203125, + "learning_rate": 0.00015901283832253397, + "loss": 
1.6233, + "step": 3834 + }, + { + "epoch": 1.1875586120662707, + "grad_norm": 0.287109375, + "learning_rate": 0.00015899300049159772, + "loss": 1.8152, + "step": 3835 + }, + { + "epoch": 1.1878712097530477, + "grad_norm": 0.26953125, + "learning_rate": 0.00015897315909917887, + "loss": 1.652, + "step": 3836 + }, + { + "epoch": 1.188183807439825, + "grad_norm": 0.271484375, + "learning_rate": 0.00015895331414647523, + "loss": 1.4338, + "step": 3837 + }, + { + "epoch": 1.188496405126602, + "grad_norm": 0.259765625, + "learning_rate": 0.0001589334656346849, + "loss": 1.5069, + "step": 3838 + }, + { + "epoch": 1.188809002813379, + "grad_norm": 0.283203125, + "learning_rate": 0.00015891361356500618, + "loss": 1.5154, + "step": 3839 + }, + { + "epoch": 1.1891216005001564, + "grad_norm": 0.28125, + "learning_rate": 0.0001588937579386375, + "loss": 1.7832, + "step": 3840 + }, + { + "epoch": 1.1894341981869334, + "grad_norm": 0.263671875, + "learning_rate": 0.0001588738987567776, + "loss": 1.5498, + "step": 3841 + }, + { + "epoch": 1.1897467958737105, + "grad_norm": 0.271484375, + "learning_rate": 0.00015885403602062544, + "loss": 1.5846, + "step": 3842 + }, + { + "epoch": 1.1900593935604877, + "grad_norm": 0.265625, + "learning_rate": 0.00015883416973138013, + "loss": 1.6232, + "step": 3843 + }, + { + "epoch": 1.1903719912472648, + "grad_norm": 0.26171875, + "learning_rate": 0.00015881429989024096, + "loss": 1.7651, + "step": 3844 + }, + { + "epoch": 1.1906845889340418, + "grad_norm": 0.271484375, + "learning_rate": 0.0001587944264984076, + "loss": 1.3701, + "step": 3845 + }, + { + "epoch": 1.190997186620819, + "grad_norm": 0.259765625, + "learning_rate": 0.0001587745495570798, + "loss": 1.449, + "step": 3846 + }, + { + "epoch": 1.1913097843075962, + "grad_norm": 0.26171875, + "learning_rate": 0.00015875466906745752, + "loss": 1.3349, + "step": 3847 + }, + { + "epoch": 1.1916223819943732, + "grad_norm": 0.265625, + "learning_rate": 0.00015873478503074102, + "loss": 1.5907, + "step": 3848 + }, + { + "epoch": 1.1919349796811503, + "grad_norm": 0.26171875, + "learning_rate": 0.0001587148974481307, + "loss": 1.5187, + "step": 3849 + }, + { + "epoch": 1.1922475773679275, + "grad_norm": 0.275390625, + "learning_rate": 0.0001586950063208272, + "loss": 1.5337, + "step": 3850 + }, + { + "epoch": 1.1925601750547046, + "grad_norm": 0.25390625, + "learning_rate": 0.00015867511165003134, + "loss": 1.5069, + "step": 3851 + }, + { + "epoch": 1.1928727727414816, + "grad_norm": 0.275390625, + "learning_rate": 0.00015865521343694426, + "loss": 1.5727, + "step": 3852 + }, + { + "epoch": 1.193185370428259, + "grad_norm": 0.283203125, + "learning_rate": 0.00015863531168276718, + "loss": 1.7666, + "step": 3853 + }, + { + "epoch": 1.193497968115036, + "grad_norm": 0.28125, + "learning_rate": 0.00015861540638870163, + "loss": 1.82, + "step": 3854 + }, + { + "epoch": 1.193810565801813, + "grad_norm": 0.275390625, + "learning_rate": 0.0001585954975559493, + "loss": 1.6074, + "step": 3855 + }, + { + "epoch": 1.1941231634885903, + "grad_norm": 0.25390625, + "learning_rate": 0.00015857558518571208, + "loss": 1.38, + "step": 3856 + }, + { + "epoch": 1.1944357611753673, + "grad_norm": 0.271484375, + "learning_rate": 0.00015855566927919216, + "loss": 1.3888, + "step": 3857 + }, + { + "epoch": 1.1947483588621444, + "grad_norm": 0.259765625, + "learning_rate": 0.00015853574983759185, + "loss": 1.5808, + "step": 3858 + }, + { + "epoch": 1.1950609565489216, + "grad_norm": 0.283203125, + "learning_rate": 0.00015851582686211377, + 
"loss": 1.333, + "step": 3859 + }, + { + "epoch": 1.1953735542356987, + "grad_norm": 0.26953125, + "learning_rate": 0.00015849590035396064, + "loss": 1.389, + "step": 3860 + }, + { + "epoch": 1.1956861519224757, + "grad_norm": 0.26953125, + "learning_rate": 0.00015847597031433546, + "loss": 1.6015, + "step": 3861 + }, + { + "epoch": 1.1959987496092528, + "grad_norm": 0.265625, + "learning_rate": 0.00015845603674444144, + "loss": 1.5003, + "step": 3862 + }, + { + "epoch": 1.19631134729603, + "grad_norm": 0.2734375, + "learning_rate": 0.00015843609964548197, + "loss": 1.3325, + "step": 3863 + }, + { + "epoch": 1.196623944982807, + "grad_norm": 0.2734375, + "learning_rate": 0.0001584161590186607, + "loss": 1.5014, + "step": 3864 + }, + { + "epoch": 1.1969365426695842, + "grad_norm": 0.25, + "learning_rate": 0.00015839621486518147, + "loss": 1.5025, + "step": 3865 + }, + { + "epoch": 1.1972491403563614, + "grad_norm": 0.263671875, + "learning_rate": 0.00015837626718624836, + "loss": 1.509, + "step": 3866 + }, + { + "epoch": 1.1975617380431385, + "grad_norm": 0.267578125, + "learning_rate": 0.0001583563159830656, + "loss": 1.5751, + "step": 3867 + }, + { + "epoch": 1.1978743357299155, + "grad_norm": 0.28125, + "learning_rate": 0.00015833636125683767, + "loss": 1.7019, + "step": 3868 + }, + { + "epoch": 1.1981869334166928, + "grad_norm": 0.275390625, + "learning_rate": 0.00015831640300876927, + "loss": 1.2996, + "step": 3869 + }, + { + "epoch": 1.1984995311034699, + "grad_norm": 0.2734375, + "learning_rate": 0.0001582964412400653, + "loss": 1.8939, + "step": 3870 + }, + { + "epoch": 1.198812128790247, + "grad_norm": 0.27734375, + "learning_rate": 0.0001582764759519309, + "loss": 1.6502, + "step": 3871 + }, + { + "epoch": 1.1991247264770242, + "grad_norm": 0.265625, + "learning_rate": 0.0001582565071455714, + "loss": 1.5804, + "step": 3872 + }, + { + "epoch": 1.1994373241638012, + "grad_norm": 0.26171875, + "learning_rate": 0.0001582365348221923, + "loss": 1.4524, + "step": 3873 + }, + { + "epoch": 1.1997499218505783, + "grad_norm": 0.283203125, + "learning_rate": 0.0001582165589829994, + "loss": 1.5058, + "step": 3874 + }, + { + "epoch": 1.2000625195373553, + "grad_norm": 0.263671875, + "learning_rate": 0.00015819657962919863, + "loss": 1.4116, + "step": 3875 + }, + { + "epoch": 1.2003751172241326, + "grad_norm": 0.265625, + "learning_rate": 0.00015817659676199618, + "loss": 1.7421, + "step": 3876 + }, + { + "epoch": 1.2006877149109096, + "grad_norm": 0.26953125, + "learning_rate": 0.00015815661038259848, + "loss": 1.4993, + "step": 3877 + }, + { + "epoch": 1.2010003125976867, + "grad_norm": 0.2578125, + "learning_rate": 0.0001581366204922121, + "loss": 1.5964, + "step": 3878 + }, + { + "epoch": 1.201312910284464, + "grad_norm": 0.267578125, + "learning_rate": 0.00015811662709204382, + "loss": 1.6863, + "step": 3879 + }, + { + "epoch": 1.201625507971241, + "grad_norm": 0.259765625, + "learning_rate": 0.0001580966301833007, + "loss": 1.3213, + "step": 3880 + }, + { + "epoch": 1.201938105658018, + "grad_norm": 0.248046875, + "learning_rate": 0.00015807662976719005, + "loss": 1.5946, + "step": 3881 + }, + { + "epoch": 1.2022507033447953, + "grad_norm": 0.2734375, + "learning_rate": 0.00015805662584491922, + "loss": 1.5478, + "step": 3882 + }, + { + "epoch": 1.2025633010315724, + "grad_norm": 0.2734375, + "learning_rate": 0.0001580366184176959, + "loss": 1.4345, + "step": 3883 + }, + { + "epoch": 1.2028758987183494, + "grad_norm": 0.271484375, + "learning_rate": 0.00015801660748672794, + "loss": 
1.3025, + "step": 3884 + }, + { + "epoch": 1.2031884964051267, + "grad_norm": 0.27734375, + "learning_rate": 0.00015799659305322348, + "loss": 1.5366, + "step": 3885 + }, + { + "epoch": 1.2035010940919038, + "grad_norm": 0.26953125, + "learning_rate": 0.0001579765751183908, + "loss": 1.5513, + "step": 3886 + }, + { + "epoch": 1.2038136917786808, + "grad_norm": 0.271484375, + "learning_rate": 0.00015795655368343838, + "loss": 1.4599, + "step": 3887 + }, + { + "epoch": 1.2041262894654579, + "grad_norm": 0.2490234375, + "learning_rate": 0.00015793652874957498, + "loss": 1.1852, + "step": 3888 + }, + { + "epoch": 1.2044388871522351, + "grad_norm": 0.279296875, + "learning_rate": 0.0001579165003180095, + "loss": 1.5249, + "step": 3889 + }, + { + "epoch": 1.2047514848390122, + "grad_norm": 0.263671875, + "learning_rate": 0.0001578964683899511, + "loss": 1.5092, + "step": 3890 + }, + { + "epoch": 1.2050640825257892, + "grad_norm": 0.283203125, + "learning_rate": 0.00015787643296660912, + "loss": 1.8863, + "step": 3891 + }, + { + "epoch": 1.2053766802125665, + "grad_norm": 0.2734375, + "learning_rate": 0.00015785639404919315, + "loss": 1.4129, + "step": 3892 + }, + { + "epoch": 1.2056892778993435, + "grad_norm": 0.255859375, + "learning_rate": 0.00015783635163891288, + "loss": 1.5204, + "step": 3893 + }, + { + "epoch": 1.2060018755861206, + "grad_norm": 0.263671875, + "learning_rate": 0.0001578163057369784, + "loss": 1.4376, + "step": 3894 + }, + { + "epoch": 1.2063144732728979, + "grad_norm": 0.265625, + "learning_rate": 0.0001577962563445999, + "loss": 1.3734, + "step": 3895 + }, + { + "epoch": 1.206627070959675, + "grad_norm": 0.267578125, + "learning_rate": 0.0001577762034629877, + "loss": 1.4842, + "step": 3896 + }, + { + "epoch": 1.206939668646452, + "grad_norm": 0.26171875, + "learning_rate": 0.00015775614709335253, + "loss": 1.9202, + "step": 3897 + }, + { + "epoch": 1.2072522663332292, + "grad_norm": 0.322265625, + "learning_rate": 0.0001577360872369051, + "loss": 2.3281, + "step": 3898 + }, + { + "epoch": 1.2075648640200063, + "grad_norm": 0.265625, + "learning_rate": 0.00015771602389485654, + "loss": 1.658, + "step": 3899 + }, + { + "epoch": 1.2078774617067833, + "grad_norm": 0.271484375, + "learning_rate": 0.00015769595706841807, + "loss": 1.6683, + "step": 3900 + }, + { + "epoch": 1.2081900593935604, + "grad_norm": 0.265625, + "learning_rate": 0.00015767588675880115, + "loss": 1.2917, + "step": 3901 + }, + { + "epoch": 1.2085026570803377, + "grad_norm": 0.263671875, + "learning_rate": 0.00015765581296721742, + "loss": 1.6233, + "step": 3902 + }, + { + "epoch": 1.2088152547671147, + "grad_norm": 0.25390625, + "learning_rate": 0.00015763573569487881, + "loss": 1.5035, + "step": 3903 + }, + { + "epoch": 1.2091278524538918, + "grad_norm": 0.2734375, + "learning_rate": 0.0001576156549429974, + "loss": 1.5276, + "step": 3904 + }, + { + "epoch": 1.209440450140669, + "grad_norm": 0.25390625, + "learning_rate": 0.00015759557071278547, + "loss": 1.6975, + "step": 3905 + }, + { + "epoch": 1.209753047827446, + "grad_norm": 0.26171875, + "learning_rate": 0.00015757548300545556, + "loss": 1.6439, + "step": 3906 + }, + { + "epoch": 1.2100656455142231, + "grad_norm": 0.26171875, + "learning_rate": 0.00015755539182222034, + "loss": 1.3377, + "step": 3907 + }, + { + "epoch": 1.2103782432010004, + "grad_norm": 0.26953125, + "learning_rate": 0.0001575352971642928, + "loss": 1.8499, + "step": 3908 + }, + { + "epoch": 1.2106908408877775, + "grad_norm": 0.2578125, + "learning_rate": 0.00015751519903288604, 
+ "loss": 1.694, + "step": 3909 + }, + { + "epoch": 1.2110034385745545, + "grad_norm": 0.2578125, + "learning_rate": 0.00015749509742921341, + "loss": 1.6426, + "step": 3910 + }, + { + "epoch": 1.2113160362613318, + "grad_norm": 0.26953125, + "learning_rate": 0.00015747499235448852, + "loss": 1.4628, + "step": 3911 + }, + { + "epoch": 1.2116286339481088, + "grad_norm": 0.2734375, + "learning_rate": 0.00015745488380992505, + "loss": 1.3588, + "step": 3912 + }, + { + "epoch": 1.2119412316348859, + "grad_norm": 0.259765625, + "learning_rate": 0.00015743477179673709, + "loss": 1.5574, + "step": 3913 + }, + { + "epoch": 1.212253829321663, + "grad_norm": 0.26953125, + "learning_rate": 0.00015741465631613873, + "loss": 1.481, + "step": 3914 + }, + { + "epoch": 1.2125664270084402, + "grad_norm": 0.271484375, + "learning_rate": 0.0001573945373693444, + "loss": 1.7328, + "step": 3915 + }, + { + "epoch": 1.2128790246952172, + "grad_norm": 0.28515625, + "learning_rate": 0.00015737441495756871, + "loss": 1.4424, + "step": 3916 + }, + { + "epoch": 1.2131916223819943, + "grad_norm": 0.271484375, + "learning_rate": 0.00015735428908202645, + "loss": 1.5498, + "step": 3917 + }, + { + "epoch": 1.2135042200687716, + "grad_norm": 0.26171875, + "learning_rate": 0.0001573341597439327, + "loss": 1.673, + "step": 3918 + }, + { + "epoch": 1.2138168177555486, + "grad_norm": 0.279296875, + "learning_rate": 0.00015731402694450268, + "loss": 1.3998, + "step": 3919 + }, + { + "epoch": 1.2141294154423257, + "grad_norm": 0.267578125, + "learning_rate": 0.00015729389068495182, + "loss": 1.6412, + "step": 3920 + }, + { + "epoch": 1.214442013129103, + "grad_norm": 0.259765625, + "learning_rate": 0.00015727375096649576, + "loss": 1.4878, + "step": 3921 + }, + { + "epoch": 1.21475461081588, + "grad_norm": 0.283203125, + "learning_rate": 0.00015725360779035035, + "loss": 1.5505, + "step": 3922 + }, + { + "epoch": 1.215067208502657, + "grad_norm": 0.263671875, + "learning_rate": 0.0001572334611577317, + "loss": 1.6475, + "step": 3923 + }, + { + "epoch": 1.2153798061894343, + "grad_norm": 0.2734375, + "learning_rate": 0.000157213311069856, + "loss": 1.5155, + "step": 3924 + }, + { + "epoch": 1.2156924038762114, + "grad_norm": 0.263671875, + "learning_rate": 0.0001571931575279399, + "loss": 1.5424, + "step": 3925 + }, + { + "epoch": 1.2160050015629884, + "grad_norm": 0.263671875, + "learning_rate": 0.00015717300053319996, + "loss": 1.797, + "step": 3926 + }, + { + "epoch": 1.2163175992497655, + "grad_norm": 0.265625, + "learning_rate": 0.0001571528400868531, + "loss": 1.382, + "step": 3927 + }, + { + "epoch": 1.2166301969365427, + "grad_norm": 0.2578125, + "learning_rate": 0.0001571326761901165, + "loss": 1.3145, + "step": 3928 + }, + { + "epoch": 1.2169427946233198, + "grad_norm": 0.259765625, + "learning_rate": 0.0001571125088442074, + "loss": 1.5764, + "step": 3929 + }, + { + "epoch": 1.2172553923100968, + "grad_norm": 0.2578125, + "learning_rate": 0.00015709233805034337, + "loss": 1.4217, + "step": 3930 + }, + { + "epoch": 1.217567989996874, + "grad_norm": 0.259765625, + "learning_rate": 0.00015707216380974215, + "loss": 1.4045, + "step": 3931 + }, + { + "epoch": 1.2178805876836512, + "grad_norm": 0.26953125, + "learning_rate": 0.00015705198612362165, + "loss": 1.4157, + "step": 3932 + }, + { + "epoch": 1.2181931853704282, + "grad_norm": 0.275390625, + "learning_rate": 0.00015703180499320008, + "loss": 1.6023, + "step": 3933 + }, + { + "epoch": 1.2185057830572055, + "grad_norm": 0.263671875, + "learning_rate": 
0.00015701162041969574, + "loss": 1.5164, + "step": 3934 + }, + { + "epoch": 1.2188183807439825, + "grad_norm": 0.275390625, + "learning_rate": 0.0001569914324043272, + "loss": 2.0203, + "step": 3935 + }, + { + "epoch": 1.2191309784307596, + "grad_norm": 0.2734375, + "learning_rate": 0.0001569712409483133, + "loss": 1.3004, + "step": 3936 + }, + { + "epoch": 1.2194435761175368, + "grad_norm": 0.267578125, + "learning_rate": 0.00015695104605287295, + "loss": 1.7227, + "step": 3937 + }, + { + "epoch": 1.219756173804314, + "grad_norm": 0.2578125, + "learning_rate": 0.0001569308477192254, + "loss": 1.5001, + "step": 3938 + }, + { + "epoch": 1.220068771491091, + "grad_norm": 0.26953125, + "learning_rate": 0.00015691064594859004, + "loss": 1.3581, + "step": 3939 + }, + { + "epoch": 1.220381369177868, + "grad_norm": 0.26953125, + "learning_rate": 0.00015689044074218645, + "loss": 1.7562, + "step": 3940 + }, + { + "epoch": 1.2206939668646453, + "grad_norm": 0.2734375, + "learning_rate": 0.00015687023210123443, + "loss": 1.8915, + "step": 3941 + }, + { + "epoch": 1.2210065645514223, + "grad_norm": 0.26171875, + "learning_rate": 0.00015685002002695407, + "loss": 1.4896, + "step": 3942 + }, + { + "epoch": 1.2213191622381994, + "grad_norm": 0.26953125, + "learning_rate": 0.00015682980452056552, + "loss": 1.3309, + "step": 3943 + }, + { + "epoch": 1.2216317599249766, + "grad_norm": 0.2578125, + "learning_rate": 0.0001568095855832893, + "loss": 1.673, + "step": 3944 + }, + { + "epoch": 1.2219443576117537, + "grad_norm": 0.267578125, + "learning_rate": 0.00015678936321634598, + "loss": 1.2337, + "step": 3945 + }, + { + "epoch": 1.2222569552985307, + "grad_norm": 0.283203125, + "learning_rate": 0.0001567691374209564, + "loss": 1.3578, + "step": 3946 + }, + { + "epoch": 1.222569552985308, + "grad_norm": 0.279296875, + "learning_rate": 0.00015674890819834168, + "loss": 1.561, + "step": 3947 + }, + { + "epoch": 1.222882150672085, + "grad_norm": 0.27734375, + "learning_rate": 0.00015672867554972306, + "loss": 1.3768, + "step": 3948 + }, + { + "epoch": 1.223194748358862, + "grad_norm": 0.271484375, + "learning_rate": 0.000156708439476322, + "loss": 1.6251, + "step": 3949 + }, + { + "epoch": 1.2235073460456394, + "grad_norm": 0.2734375, + "learning_rate": 0.0001566881999793602, + "loss": 1.3627, + "step": 3950 + }, + { + "epoch": 1.2238199437324164, + "grad_norm": 0.255859375, + "learning_rate": 0.0001566679570600595, + "loss": 1.6194, + "step": 3951 + }, + { + "epoch": 1.2241325414191935, + "grad_norm": 0.2734375, + "learning_rate": 0.00015664771071964207, + "loss": 1.4797, + "step": 3952 + }, + { + "epoch": 1.2244451391059705, + "grad_norm": 0.263671875, + "learning_rate": 0.0001566274609593301, + "loss": 1.5906, + "step": 3953 + }, + { + "epoch": 1.2247577367927478, + "grad_norm": 0.279296875, + "learning_rate": 0.00015660720778034616, + "loss": 1.4935, + "step": 3954 + }, + { + "epoch": 1.2250703344795248, + "grad_norm": 0.26171875, + "learning_rate": 0.000156586951183913, + "loss": 1.5746, + "step": 3955 + }, + { + "epoch": 1.225382932166302, + "grad_norm": 0.28515625, + "learning_rate": 0.00015656669117125344, + "loss": 1.7911, + "step": 3956 + }, + { + "epoch": 1.2256955298530792, + "grad_norm": 0.26171875, + "learning_rate": 0.00015654642774359068, + "loss": 1.5117, + "step": 3957 + }, + { + "epoch": 1.2260081275398562, + "grad_norm": 0.2734375, + "learning_rate": 0.000156526160902148, + "loss": 1.6573, + "step": 3958 + }, + { + "epoch": 1.2263207252266333, + "grad_norm": 0.263671875, + "learning_rate": 
0.00015650589064814896, + "loss": 1.5288, + "step": 3959 + }, + { + "epoch": 1.2266333229134103, + "grad_norm": 0.279296875, + "learning_rate": 0.00015648561698281728, + "loss": 1.458, + "step": 3960 + }, + { + "epoch": 1.2269459206001876, + "grad_norm": 0.265625, + "learning_rate": 0.00015646533990737696, + "loss": 1.2534, + "step": 3961 + }, + { + "epoch": 1.2272585182869646, + "grad_norm": 0.25390625, + "learning_rate": 0.00015644505942305207, + "loss": 1.6487, + "step": 3962 + }, + { + "epoch": 1.227571115973742, + "grad_norm": 0.26171875, + "learning_rate": 0.00015642477553106702, + "loss": 1.637, + "step": 3963 + }, + { + "epoch": 1.227883713660519, + "grad_norm": 0.263671875, + "learning_rate": 0.00015640448823264638, + "loss": 1.6421, + "step": 3964 + }, + { + "epoch": 1.228196311347296, + "grad_norm": 0.2734375, + "learning_rate": 0.00015638419752901493, + "loss": 1.4292, + "step": 3965 + }, + { + "epoch": 1.228508909034073, + "grad_norm": 0.2578125, + "learning_rate": 0.0001563639034213976, + "loss": 1.542, + "step": 3966 + }, + { + "epoch": 1.2288215067208503, + "grad_norm": 0.267578125, + "learning_rate": 0.0001563436059110196, + "loss": 1.4943, + "step": 3967 + }, + { + "epoch": 1.2291341044076274, + "grad_norm": 0.2578125, + "learning_rate": 0.00015632330499910633, + "loss": 1.4636, + "step": 3968 + }, + { + "epoch": 1.2294467020944044, + "grad_norm": 0.2578125, + "learning_rate": 0.00015630300068688333, + "loss": 1.5381, + "step": 3969 + }, + { + "epoch": 1.2297592997811817, + "grad_norm": 0.271484375, + "learning_rate": 0.00015628269297557646, + "loss": 1.4995, + "step": 3970 + }, + { + "epoch": 1.2300718974679588, + "grad_norm": 0.267578125, + "learning_rate": 0.00015626238186641168, + "loss": 1.4239, + "step": 3971 + }, + { + "epoch": 1.2303844951547358, + "grad_norm": 0.271484375, + "learning_rate": 0.0001562420673606152, + "loss": 1.633, + "step": 3972 + }, + { + "epoch": 1.2306970928415129, + "grad_norm": 0.29296875, + "learning_rate": 0.00015622174945941346, + "loss": 1.6459, + "step": 3973 + }, + { + "epoch": 1.2310096905282901, + "grad_norm": 0.279296875, + "learning_rate": 0.00015620142816403308, + "loss": 1.6795, + "step": 3974 + }, + { + "epoch": 1.2313222882150672, + "grad_norm": 0.279296875, + "learning_rate": 0.0001561811034757008, + "loss": 1.4258, + "step": 3975 + }, + { + "epoch": 1.2316348859018444, + "grad_norm": 0.2734375, + "learning_rate": 0.00015616077539564377, + "loss": 1.66, + "step": 3976 + }, + { + "epoch": 1.2319474835886215, + "grad_norm": 0.275390625, + "learning_rate": 0.00015614044392508913, + "loss": 1.5547, + "step": 3977 + }, + { + "epoch": 1.2322600812753985, + "grad_norm": 0.26953125, + "learning_rate": 0.00015612010906526438, + "loss": 1.6773, + "step": 3978 + }, + { + "epoch": 1.2325726789621756, + "grad_norm": 0.255859375, + "learning_rate": 0.00015609977081739712, + "loss": 1.2064, + "step": 3979 + }, + { + "epoch": 1.2328852766489529, + "grad_norm": 0.27734375, + "learning_rate": 0.00015607942918271519, + "loss": 1.4814, + "step": 3980 + }, + { + "epoch": 1.23319787433573, + "grad_norm": 0.259765625, + "learning_rate": 0.00015605908416244666, + "loss": 1.7813, + "step": 3981 + }, + { + "epoch": 1.233510472022507, + "grad_norm": 0.26953125, + "learning_rate": 0.00015603873575781977, + "loss": 1.5441, + "step": 3982 + }, + { + "epoch": 1.2338230697092842, + "grad_norm": 0.267578125, + "learning_rate": 0.00015601838397006303, + "loss": 1.7521, + "step": 3983 + }, + { + "epoch": 1.2341356673960613, + "grad_norm": 0.271484375, + 
"learning_rate": 0.00015599802880040503, + "loss": 1.4041, + "step": 3984 + }, + { + "epoch": 1.2344482650828383, + "grad_norm": 0.259765625, + "learning_rate": 0.00015597767025007472, + "loss": 1.814, + "step": 3985 + }, + { + "epoch": 1.2347608627696154, + "grad_norm": 0.265625, + "learning_rate": 0.00015595730832030106, + "loss": 1.6398, + "step": 3986 + }, + { + "epoch": 1.2350734604563927, + "grad_norm": 0.2734375, + "learning_rate": 0.00015593694301231347, + "loss": 1.5699, + "step": 3987 + }, + { + "epoch": 1.2353860581431697, + "grad_norm": 0.267578125, + "learning_rate": 0.00015591657432734128, + "loss": 1.526, + "step": 3988 + }, + { + "epoch": 1.235698655829947, + "grad_norm": 0.26953125, + "learning_rate": 0.00015589620226661425, + "loss": 1.6643, + "step": 3989 + }, + { + "epoch": 1.236011253516724, + "grad_norm": 0.267578125, + "learning_rate": 0.0001558758268313623, + "loss": 1.6431, + "step": 3990 + }, + { + "epoch": 1.236323851203501, + "grad_norm": 0.279296875, + "learning_rate": 0.00015585544802281545, + "loss": 1.4348, + "step": 3991 + }, + { + "epoch": 1.2366364488902781, + "grad_norm": 0.263671875, + "learning_rate": 0.00015583506584220403, + "loss": 1.6411, + "step": 3992 + }, + { + "epoch": 1.2369490465770554, + "grad_norm": 0.287109375, + "learning_rate": 0.00015581468029075854, + "loss": 1.6184, + "step": 3993 + }, + { + "epoch": 1.2372616442638324, + "grad_norm": 0.27734375, + "learning_rate": 0.00015579429136970967, + "loss": 1.5082, + "step": 3994 + }, + { + "epoch": 1.2375742419506095, + "grad_norm": 0.279296875, + "learning_rate": 0.00015577389908028836, + "loss": 1.5839, + "step": 3995 + }, + { + "epoch": 1.2378868396373868, + "grad_norm": 0.267578125, + "learning_rate": 0.00015575350342372562, + "loss": 1.5707, + "step": 3996 + }, + { + "epoch": 1.2381994373241638, + "grad_norm": 0.28515625, + "learning_rate": 0.00015573310440125288, + "loss": 1.7643, + "step": 3997 + }, + { + "epoch": 1.2385120350109409, + "grad_norm": 0.26953125, + "learning_rate": 0.0001557127020141016, + "loss": 1.4169, + "step": 3998 + }, + { + "epoch": 1.238824632697718, + "grad_norm": 0.26953125, + "learning_rate": 0.0001556922962635035, + "loss": 1.6456, + "step": 3999 + }, + { + "epoch": 1.2391372303844952, + "grad_norm": 0.265625, + "learning_rate": 0.0001556718871506905, + "loss": 1.8587, + "step": 4000 + }, + { + "epoch": 1.2394498280712722, + "grad_norm": 0.27734375, + "learning_rate": 0.00015565147467689477, + "loss": 1.7377, + "step": 4001 + }, + { + "epoch": 1.2397624257580495, + "grad_norm": 0.2734375, + "learning_rate": 0.00015563105884334853, + "loss": 1.7558, + "step": 4002 + }, + { + "epoch": 1.2400750234448266, + "grad_norm": 0.275390625, + "learning_rate": 0.00015561063965128442, + "loss": 1.4029, + "step": 4003 + }, + { + "epoch": 1.2403876211316036, + "grad_norm": 0.26171875, + "learning_rate": 0.0001555902171019351, + "loss": 1.7066, + "step": 4004 + }, + { + "epoch": 1.2407002188183807, + "grad_norm": 0.287109375, + "learning_rate": 0.00015556979119653357, + "loss": 1.6246, + "step": 4005 + }, + { + "epoch": 1.241012816505158, + "grad_norm": 0.255859375, + "learning_rate": 0.00015554936193631292, + "loss": 1.3696, + "step": 4006 + }, + { + "epoch": 1.241325414191935, + "grad_norm": 0.255859375, + "learning_rate": 0.0001555289293225065, + "loss": 1.4733, + "step": 4007 + }, + { + "epoch": 1.241638011878712, + "grad_norm": 0.267578125, + "learning_rate": 0.00015550849335634786, + "loss": 1.6661, + "step": 4008 + }, + { + "epoch": 1.2419506095654893, + "grad_norm": 
0.26171875, + "learning_rate": 0.00015548805403907073, + "loss": 1.3743, + "step": 4009 + }, + { + "epoch": 1.2422632072522664, + "grad_norm": 0.28515625, + "learning_rate": 0.00015546761137190905, + "loss": 1.4139, + "step": 4010 + }, + { + "epoch": 1.2425758049390434, + "grad_norm": 0.26171875, + "learning_rate": 0.000155447165356097, + "loss": 1.7938, + "step": 4011 + }, + { + "epoch": 1.2428884026258205, + "grad_norm": 0.28515625, + "learning_rate": 0.00015542671599286893, + "loss": 1.4203, + "step": 4012 + }, + { + "epoch": 1.2432010003125977, + "grad_norm": 0.267578125, + "learning_rate": 0.0001554062632834594, + "loss": 1.5658, + "step": 4013 + }, + { + "epoch": 1.2435135979993748, + "grad_norm": 0.28125, + "learning_rate": 0.00015538580722910313, + "loss": 1.726, + "step": 4014 + }, + { + "epoch": 1.2438261956861518, + "grad_norm": 0.259765625, + "learning_rate": 0.0001553653478310351, + "loss": 1.3631, + "step": 4015 + }, + { + "epoch": 1.244138793372929, + "grad_norm": 0.275390625, + "learning_rate": 0.00015534488509049048, + "loss": 1.6755, + "step": 4016 + }, + { + "epoch": 1.2444513910597061, + "grad_norm": 0.275390625, + "learning_rate": 0.0001553244190087046, + "loss": 1.4839, + "step": 4017 + }, + { + "epoch": 1.2447639887464832, + "grad_norm": 0.265625, + "learning_rate": 0.0001553039495869131, + "loss": 1.6007, + "step": 4018 + }, + { + "epoch": 1.2450765864332605, + "grad_norm": 0.275390625, + "learning_rate": 0.00015528347682635163, + "loss": 1.6159, + "step": 4019 + }, + { + "epoch": 1.2453891841200375, + "grad_norm": 0.26171875, + "learning_rate": 0.0001552630007282562, + "loss": 1.4538, + "step": 4020 + }, + { + "epoch": 1.2457017818068146, + "grad_norm": 0.2734375, + "learning_rate": 0.00015524252129386302, + "loss": 1.4477, + "step": 4021 + }, + { + "epoch": 1.2460143794935918, + "grad_norm": 0.275390625, + "learning_rate": 0.00015522203852440843, + "loss": 1.6675, + "step": 4022 + }, + { + "epoch": 1.246326977180369, + "grad_norm": 0.25390625, + "learning_rate": 0.00015520155242112904, + "loss": 1.5112, + "step": 4023 + }, + { + "epoch": 1.246639574867146, + "grad_norm": 0.263671875, + "learning_rate": 0.00015518106298526157, + "loss": 1.4082, + "step": 4024 + }, + { + "epoch": 1.246952172553923, + "grad_norm": 0.2734375, + "learning_rate": 0.000155160570218043, + "loss": 1.5276, + "step": 4025 + }, + { + "epoch": 1.2472647702407003, + "grad_norm": 0.267578125, + "learning_rate": 0.00015514007412071053, + "loss": 1.597, + "step": 4026 + }, + { + "epoch": 1.2475773679274773, + "grad_norm": 0.2734375, + "learning_rate": 0.00015511957469450146, + "loss": 1.5105, + "step": 4027 + }, + { + "epoch": 1.2478899656142544, + "grad_norm": 0.279296875, + "learning_rate": 0.0001550990719406535, + "loss": 1.4222, + "step": 4028 + }, + { + "epoch": 1.2482025633010316, + "grad_norm": 0.251953125, + "learning_rate": 0.0001550785658604043, + "loss": 1.6306, + "step": 4029 + }, + { + "epoch": 1.2485151609878087, + "grad_norm": 0.263671875, + "learning_rate": 0.00015505805645499193, + "loss": 1.3462, + "step": 4030 + }, + { + "epoch": 1.2488277586745857, + "grad_norm": 0.263671875, + "learning_rate": 0.00015503754372565452, + "loss": 1.5608, + "step": 4031 + }, + { + "epoch": 1.249140356361363, + "grad_norm": 0.279296875, + "learning_rate": 0.00015501702767363045, + "loss": 1.4008, + "step": 4032 + }, + { + "epoch": 1.24945295404814, + "grad_norm": 0.271484375, + "learning_rate": 0.0001549965083001583, + "loss": 1.6971, + "step": 4033 + }, + { + "epoch": 1.249765551734917, + 
"grad_norm": 0.265625, + "learning_rate": 0.00015497598560647687, + "loss": 1.3958, + "step": 4034 + }, + { + "epoch": 1.2500781494216944, + "grad_norm": 0.265625, + "learning_rate": 0.00015495545959382512, + "loss": 1.6028, + "step": 4035 + }, + { + "epoch": 1.2503907471084714, + "grad_norm": 0.26953125, + "learning_rate": 0.0001549349302634423, + "loss": 1.578, + "step": 4036 + }, + { + "epoch": 1.2507033447952485, + "grad_norm": 0.283203125, + "learning_rate": 0.0001549143976165677, + "loss": 1.6286, + "step": 4037 + }, + { + "epoch": 1.2510159424820255, + "grad_norm": 0.265625, + "learning_rate": 0.00015489386165444094, + "loss": 1.6549, + "step": 4038 + }, + { + "epoch": 1.2513285401688028, + "grad_norm": 0.314453125, + "learning_rate": 0.0001548733223783018, + "loss": 2.2127, + "step": 4039 + }, + { + "epoch": 1.2516411378555798, + "grad_norm": 0.259765625, + "learning_rate": 0.00015485277978939026, + "loss": 1.9045, + "step": 4040 + }, + { + "epoch": 1.2519537355423571, + "grad_norm": 0.263671875, + "learning_rate": 0.00015483223388894647, + "loss": 1.4649, + "step": 4041 + }, + { + "epoch": 1.2522663332291342, + "grad_norm": 0.28515625, + "learning_rate": 0.00015481168467821092, + "loss": 1.5532, + "step": 4042 + }, + { + "epoch": 1.2525789309159112, + "grad_norm": 0.27734375, + "learning_rate": 0.00015479113215842406, + "loss": 1.4651, + "step": 4043 + }, + { + "epoch": 1.2528915286026883, + "grad_norm": 0.271484375, + "learning_rate": 0.00015477057633082674, + "loss": 1.9448, + "step": 4044 + }, + { + "epoch": 1.2532041262894655, + "grad_norm": 0.263671875, + "learning_rate": 0.00015475001719665997, + "loss": 1.6166, + "step": 4045 + }, + { + "epoch": 1.2535167239762426, + "grad_norm": 0.259765625, + "learning_rate": 0.00015472945475716486, + "loss": 1.6965, + "step": 4046 + }, + { + "epoch": 1.2538293216630196, + "grad_norm": 0.2890625, + "learning_rate": 0.0001547088890135828, + "loss": 1.4891, + "step": 4047 + }, + { + "epoch": 1.254141919349797, + "grad_norm": 0.271484375, + "learning_rate": 0.00015468831996715544, + "loss": 1.332, + "step": 4048 + }, + { + "epoch": 1.254454517036574, + "grad_norm": 0.28515625, + "learning_rate": 0.0001546677476191245, + "loss": 1.8816, + "step": 4049 + }, + { + "epoch": 1.254767114723351, + "grad_norm": 0.271484375, + "learning_rate": 0.00015464717197073195, + "loss": 1.9537, + "step": 4050 + }, + { + "epoch": 1.255079712410128, + "grad_norm": 0.271484375, + "learning_rate": 0.00015462659302322001, + "loss": 1.6064, + "step": 4051 + }, + { + "epoch": 1.2553923100969053, + "grad_norm": 0.294921875, + "learning_rate": 0.00015460601077783102, + "loss": 1.8206, + "step": 4052 + }, + { + "epoch": 1.2557049077836824, + "grad_norm": 0.271484375, + "learning_rate": 0.0001545854252358076, + "loss": 1.5358, + "step": 4053 + }, + { + "epoch": 1.2560175054704596, + "grad_norm": 0.27734375, + "learning_rate": 0.00015456483639839251, + "loss": 1.7593, + "step": 4054 + }, + { + "epoch": 1.2563301031572367, + "grad_norm": 0.275390625, + "learning_rate": 0.0001545442442668287, + "loss": 1.5966, + "step": 4055 + }, + { + "epoch": 1.2566427008440137, + "grad_norm": 0.26953125, + "learning_rate": 0.00015452364884235931, + "loss": 1.6445, + "step": 4056 + }, + { + "epoch": 1.2569552985307908, + "grad_norm": 0.26171875, + "learning_rate": 0.00015450305012622783, + "loss": 1.6093, + "step": 4057 + }, + { + "epoch": 1.257267896217568, + "grad_norm": 0.26953125, + "learning_rate": 0.00015448244811967773, + "loss": 1.337, + "step": 4058 + }, + { + "epoch": 
1.2575804939043451, + "grad_norm": 0.2578125, + "learning_rate": 0.00015446184282395282, + "loss": 1.3599, + "step": 4059 + }, + { + "epoch": 1.2578930915911222, + "grad_norm": 0.275390625, + "learning_rate": 0.00015444123424029703, + "loss": 1.6384, + "step": 4060 + }, + { + "epoch": 1.2582056892778994, + "grad_norm": 0.263671875, + "learning_rate": 0.0001544206223699546, + "loss": 1.456, + "step": 4061 + }, + { + "epoch": 1.2585182869646765, + "grad_norm": 0.2734375, + "learning_rate": 0.0001544000072141698, + "loss": 1.5647, + "step": 4062 + }, + { + "epoch": 1.2588308846514535, + "grad_norm": 0.259765625, + "learning_rate": 0.00015437938877418725, + "loss": 1.326, + "step": 4063 + }, + { + "epoch": 1.2591434823382306, + "grad_norm": 0.271484375, + "learning_rate": 0.00015435876705125173, + "loss": 1.4247, + "step": 4064 + }, + { + "epoch": 1.2594560800250079, + "grad_norm": 0.267578125, + "learning_rate": 0.00015433814204660816, + "loss": 1.6699, + "step": 4065 + }, + { + "epoch": 1.259768677711785, + "grad_norm": 0.267578125, + "learning_rate": 0.0001543175137615017, + "loss": 1.4876, + "step": 4066 + }, + { + "epoch": 1.2600812753985622, + "grad_norm": 0.302734375, + "learning_rate": 0.00015429688219717772, + "loss": 1.3906, + "step": 4067 + }, + { + "epoch": 1.2603938730853392, + "grad_norm": 0.265625, + "learning_rate": 0.0001542762473548818, + "loss": 1.5093, + "step": 4068 + }, + { + "epoch": 1.2607064707721163, + "grad_norm": 0.27734375, + "learning_rate": 0.00015425560923585963, + "loss": 1.4614, + "step": 4069 + }, + { + "epoch": 1.2610190684588933, + "grad_norm": 0.263671875, + "learning_rate": 0.0001542349678413572, + "loss": 1.6383, + "step": 4070 + }, + { + "epoch": 1.2613316661456706, + "grad_norm": 0.279296875, + "learning_rate": 0.00015421432317262065, + "loss": 1.3294, + "step": 4071 + }, + { + "epoch": 1.2616442638324477, + "grad_norm": 0.271484375, + "learning_rate": 0.00015419367523089635, + "loss": 1.4523, + "step": 4072 + }, + { + "epoch": 1.2619568615192247, + "grad_norm": 0.2890625, + "learning_rate": 0.00015417302401743075, + "loss": 1.793, + "step": 4073 + }, + { + "epoch": 1.262269459206002, + "grad_norm": 0.267578125, + "learning_rate": 0.0001541523695334707, + "loss": 1.5598, + "step": 4074 + }, + { + "epoch": 1.262582056892779, + "grad_norm": 0.26171875, + "learning_rate": 0.0001541317117802631, + "loss": 1.4749, + "step": 4075 + }, + { + "epoch": 1.262894654579556, + "grad_norm": 0.28125, + "learning_rate": 0.00015411105075905504, + "loss": 1.4417, + "step": 4076 + }, + { + "epoch": 1.2632072522663331, + "grad_norm": 0.279296875, + "learning_rate": 0.00015409038647109396, + "loss": 1.5956, + "step": 4077 + }, + { + "epoch": 1.2635198499531104, + "grad_norm": 0.26171875, + "learning_rate": 0.00015406971891762726, + "loss": 1.5047, + "step": 4078 + }, + { + "epoch": 1.2638324476398874, + "grad_norm": 0.279296875, + "learning_rate": 0.00015404904809990274, + "loss": 1.718, + "step": 4079 + }, + { + "epoch": 1.2641450453266647, + "grad_norm": 0.2890625, + "learning_rate": 0.00015402837401916835, + "loss": 1.5212, + "step": 4080 + }, + { + "epoch": 1.2644576430134418, + "grad_norm": 0.275390625, + "learning_rate": 0.00015400769667667214, + "loss": 1.564, + "step": 4081 + }, + { + "epoch": 1.2647702407002188, + "grad_norm": 0.275390625, + "learning_rate": 0.00015398701607366246, + "loss": 1.3742, + "step": 4082 + }, + { + "epoch": 1.2650828383869959, + "grad_norm": 0.271484375, + "learning_rate": 0.00015396633221138783, + "loss": 1.6599, + "step": 4083 + }, + { 
+ "epoch": 1.265395436073773, + "grad_norm": 0.265625, + "learning_rate": 0.00015394564509109692, + "loss": 1.5583, + "step": 4084 + }, + { + "epoch": 1.2657080337605502, + "grad_norm": 0.275390625, + "learning_rate": 0.00015392495471403873, + "loss": 1.5196, + "step": 4085 + }, + { + "epoch": 1.2660206314473272, + "grad_norm": 0.279296875, + "learning_rate": 0.00015390426108146226, + "loss": 1.5672, + "step": 4086 + }, + { + "epoch": 1.2663332291341045, + "grad_norm": 0.271484375, + "learning_rate": 0.00015388356419461686, + "loss": 1.3556, + "step": 4087 + }, + { + "epoch": 1.2666458268208816, + "grad_norm": 0.26953125, + "learning_rate": 0.00015386286405475206, + "loss": 1.9784, + "step": 4088 + }, + { + "epoch": 1.2669584245076586, + "grad_norm": 0.2734375, + "learning_rate": 0.00015384216066311753, + "loss": 1.4852, + "step": 4089 + }, + { + "epoch": 1.2672710221944357, + "grad_norm": 0.26171875, + "learning_rate": 0.00015382145402096308, + "loss": 1.5272, + "step": 4090 + }, + { + "epoch": 1.267583619881213, + "grad_norm": 0.267578125, + "learning_rate": 0.0001538007441295389, + "loss": 1.5805, + "step": 4091 + }, + { + "epoch": 1.26789621756799, + "grad_norm": 0.26171875, + "learning_rate": 0.00015378003099009524, + "loss": 1.4972, + "step": 4092 + }, + { + "epoch": 1.2682088152547673, + "grad_norm": 0.259765625, + "learning_rate": 0.00015375931460388258, + "loss": 1.7508, + "step": 4093 + }, + { + "epoch": 1.2685214129415443, + "grad_norm": 0.2734375, + "learning_rate": 0.0001537385949721516, + "loss": 1.477, + "step": 4094 + }, + { + "epoch": 1.2688340106283214, + "grad_norm": 0.2734375, + "learning_rate": 0.00015371787209615312, + "loss": 1.69, + "step": 4095 + }, + { + "epoch": 1.2691466083150984, + "grad_norm": 0.298828125, + "learning_rate": 0.0001536971459771383, + "loss": 1.5547, + "step": 4096 + }, + { + "epoch": 1.2694592060018755, + "grad_norm": 0.2578125, + "learning_rate": 0.00015367641661635833, + "loss": 1.4637, + "step": 4097 + }, + { + "epoch": 1.2697718036886527, + "grad_norm": 0.2734375, + "learning_rate": 0.0001536556840150647, + "loss": 1.4967, + "step": 4098 + }, + { + "epoch": 1.2700844013754298, + "grad_norm": 0.251953125, + "learning_rate": 0.000153634948174509, + "loss": 1.4932, + "step": 4099 + }, + { + "epoch": 1.270396999062207, + "grad_norm": 0.28125, + "learning_rate": 0.0001536142090959432, + "loss": 1.7785, + "step": 4100 + }, + { + "epoch": 1.270709596748984, + "grad_norm": 0.25390625, + "learning_rate": 0.00015359346678061925, + "loss": 1.6011, + "step": 4101 + }, + { + "epoch": 1.2710221944357611, + "grad_norm": 0.248046875, + "learning_rate": 0.00015357272122978942, + "loss": 1.3854, + "step": 4102 + }, + { + "epoch": 1.2713347921225382, + "grad_norm": 0.28125, + "learning_rate": 0.00015355197244470612, + "loss": 1.7274, + "step": 4103 + }, + { + "epoch": 1.2716473898093155, + "grad_norm": 0.279296875, + "learning_rate": 0.00015353122042662201, + "loss": 1.3538, + "step": 4104 + }, + { + "epoch": 1.2719599874960925, + "grad_norm": 0.2734375, + "learning_rate": 0.0001535104651767899, + "loss": 1.4942, + "step": 4105 + }, + { + "epoch": 1.2722725851828696, + "grad_norm": 0.27734375, + "learning_rate": 0.00015348970669646286, + "loss": 1.4381, + "step": 4106 + }, + { + "epoch": 1.2725851828696468, + "grad_norm": 0.275390625, + "learning_rate": 0.00015346894498689402, + "loss": 1.7603, + "step": 4107 + }, + { + "epoch": 1.2728977805564239, + "grad_norm": 0.255859375, + "learning_rate": 0.00015344818004933686, + "loss": 1.6616, + "step": 4108 + }, + { 
+ "epoch": 1.273210378243201, + "grad_norm": 0.279296875, + "learning_rate": 0.00015342741188504496, + "loss": 1.5816, + "step": 4109 + }, + { + "epoch": 1.273522975929978, + "grad_norm": 0.279296875, + "learning_rate": 0.00015340664049527208, + "loss": 1.5475, + "step": 4110 + }, + { + "epoch": 1.2738355736167553, + "grad_norm": 0.267578125, + "learning_rate": 0.0001533858658812723, + "loss": 1.3622, + "step": 4111 + }, + { + "epoch": 1.2741481713035323, + "grad_norm": 0.263671875, + "learning_rate": 0.00015336508804429975, + "loss": 1.608, + "step": 4112 + }, + { + "epoch": 1.2744607689903096, + "grad_norm": 0.255859375, + "learning_rate": 0.00015334430698560884, + "loss": 1.5008, + "step": 4113 + }, + { + "epoch": 1.2747733666770866, + "grad_norm": 0.265625, + "learning_rate": 0.00015332352270645412, + "loss": 1.6675, + "step": 4114 + }, + { + "epoch": 1.2750859643638637, + "grad_norm": 0.267578125, + "learning_rate": 0.00015330273520809042, + "loss": 1.5997, + "step": 4115 + }, + { + "epoch": 1.2753985620506407, + "grad_norm": 0.2734375, + "learning_rate": 0.00015328194449177262, + "loss": 1.7242, + "step": 4116 + }, + { + "epoch": 1.275711159737418, + "grad_norm": 0.26953125, + "learning_rate": 0.00015326115055875597, + "loss": 1.6441, + "step": 4117 + }, + { + "epoch": 1.276023757424195, + "grad_norm": 0.267578125, + "learning_rate": 0.00015324035341029578, + "loss": 1.5648, + "step": 4118 + }, + { + "epoch": 1.276336355110972, + "grad_norm": 0.26953125, + "learning_rate": 0.00015321955304764765, + "loss": 1.7804, + "step": 4119 + }, + { + "epoch": 1.2766489527977494, + "grad_norm": 0.259765625, + "learning_rate": 0.0001531987494720672, + "loss": 1.4064, + "step": 4120 + }, + { + "epoch": 1.2769615504845264, + "grad_norm": 0.291015625, + "learning_rate": 0.0001531779426848105, + "loss": 1.6266, + "step": 4121 + }, + { + "epoch": 1.2772741481713035, + "grad_norm": 0.28125, + "learning_rate": 0.00015315713268713364, + "loss": 1.5447, + "step": 4122 + }, + { + "epoch": 1.2775867458580805, + "grad_norm": 0.26953125, + "learning_rate": 0.00015313631948029292, + "loss": 1.3518, + "step": 4123 + }, + { + "epoch": 1.2778993435448578, + "grad_norm": 0.265625, + "learning_rate": 0.00015311550306554492, + "loss": 1.4158, + "step": 4124 + }, + { + "epoch": 1.2782119412316348, + "grad_norm": 0.275390625, + "learning_rate": 0.00015309468344414627, + "loss": 1.6997, + "step": 4125 + }, + { + "epoch": 1.2785245389184121, + "grad_norm": 0.259765625, + "learning_rate": 0.00015307386061735393, + "loss": 1.3474, + "step": 4126 + }, + { + "epoch": 1.2788371366051892, + "grad_norm": 0.255859375, + "learning_rate": 0.00015305303458642503, + "loss": 1.5881, + "step": 4127 + }, + { + "epoch": 1.2791497342919662, + "grad_norm": 0.275390625, + "learning_rate": 0.0001530322053526168, + "loss": 1.767, + "step": 4128 + }, + { + "epoch": 1.2794623319787433, + "grad_norm": 0.265625, + "learning_rate": 0.00015301137291718676, + "loss": 1.4263, + "step": 4129 + }, + { + "epoch": 1.2797749296655205, + "grad_norm": 0.259765625, + "learning_rate": 0.00015299053728139256, + "loss": 1.6506, + "step": 4130 + }, + { + "epoch": 1.2800875273522976, + "grad_norm": 0.271484375, + "learning_rate": 0.0001529696984464921, + "loss": 1.5171, + "step": 4131 + }, + { + "epoch": 1.2804001250390746, + "grad_norm": 0.275390625, + "learning_rate": 0.00015294885641374347, + "loss": 1.4701, + "step": 4132 + }, + { + "epoch": 1.280712722725852, + "grad_norm": 0.251953125, + "learning_rate": 0.0001529280111844049, + "loss": 1.5263, + "step": 
4133 + }, + { + "epoch": 1.281025320412629, + "grad_norm": 0.2734375, + "learning_rate": 0.0001529071627597348, + "loss": 1.4748, + "step": 4134 + }, + { + "epoch": 1.281337918099406, + "grad_norm": 0.26953125, + "learning_rate": 0.00015288631114099196, + "loss": 1.5851, + "step": 4135 + }, + { + "epoch": 1.281650515786183, + "grad_norm": 0.267578125, + "learning_rate": 0.00015286545632943506, + "loss": 1.6182, + "step": 4136 + }, + { + "epoch": 1.2819631134729603, + "grad_norm": 0.271484375, + "learning_rate": 0.00015284459832632318, + "loss": 1.6872, + "step": 4137 + }, + { + "epoch": 1.2822757111597374, + "grad_norm": 0.26953125, + "learning_rate": 0.0001528237371329156, + "loss": 1.2909, + "step": 4138 + }, + { + "epoch": 1.2825883088465146, + "grad_norm": 0.271484375, + "learning_rate": 0.00015280287275047167, + "loss": 1.4174, + "step": 4139 + }, + { + "epoch": 1.2829009065332917, + "grad_norm": 0.27734375, + "learning_rate": 0.00015278200518025102, + "loss": 1.5288, + "step": 4140 + }, + { + "epoch": 1.2832135042200687, + "grad_norm": 0.26171875, + "learning_rate": 0.00015276113442351345, + "loss": 1.4028, + "step": 4141 + }, + { + "epoch": 1.2835261019068458, + "grad_norm": 0.259765625, + "learning_rate": 0.00015274026048151898, + "loss": 1.6801, + "step": 4142 + }, + { + "epoch": 1.283838699593623, + "grad_norm": 0.283203125, + "learning_rate": 0.0001527193833555278, + "loss": 1.6331, + "step": 4143 + }, + { + "epoch": 1.2841512972804001, + "grad_norm": 0.26953125, + "learning_rate": 0.00015269850304680023, + "loss": 1.4745, + "step": 4144 + }, + { + "epoch": 1.2844638949671772, + "grad_norm": 0.26171875, + "learning_rate": 0.00015267761955659688, + "loss": 1.4401, + "step": 4145 + }, + { + "epoch": 1.2847764926539544, + "grad_norm": 0.275390625, + "learning_rate": 0.0001526567328861785, + "loss": 1.408, + "step": 4146 + }, + { + "epoch": 1.2850890903407315, + "grad_norm": 0.275390625, + "learning_rate": 0.00015263584303680609, + "loss": 1.4133, + "step": 4147 + }, + { + "epoch": 1.2854016880275085, + "grad_norm": 0.27734375, + "learning_rate": 0.00015261495000974074, + "loss": 1.5989, + "step": 4148 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.271484375, + "learning_rate": 0.00015259405380624384, + "loss": 1.7035, + "step": 4149 + }, + { + "epoch": 1.2860268834010629, + "grad_norm": 0.263671875, + "learning_rate": 0.00015257315442757685, + "loss": 1.2579, + "step": 4150 + }, + { + "epoch": 1.28633948108784, + "grad_norm": 0.271484375, + "learning_rate": 0.00015255225187500154, + "loss": 1.3545, + "step": 4151 + }, + { + "epoch": 1.2866520787746172, + "grad_norm": 0.267578125, + "learning_rate": 0.00015253134614977979, + "loss": 1.6228, + "step": 4152 + }, + { + "epoch": 1.2869646764613942, + "grad_norm": 0.28125, + "learning_rate": 0.0001525104372531738, + "loss": 1.7938, + "step": 4153 + }, + { + "epoch": 1.2872772741481713, + "grad_norm": 0.2734375, + "learning_rate": 0.00015248952518644577, + "loss": 1.5676, + "step": 4154 + }, + { + "epoch": 1.2875898718349483, + "grad_norm": 0.29296875, + "learning_rate": 0.0001524686099508582, + "loss": 1.6865, + "step": 4155 + }, + { + "epoch": 1.2879024695217256, + "grad_norm": 0.283203125, + "learning_rate": 0.0001524476915476738, + "loss": 1.5875, + "step": 4156 + }, + { + "epoch": 1.2882150672085027, + "grad_norm": 0.265625, + "learning_rate": 0.00015242676997815542, + "loss": 1.5304, + "step": 4157 + }, + { + "epoch": 1.2885276648952797, + "grad_norm": 0.2734375, + "learning_rate": 0.00015240584524356613, + "loss": 1.5193, 
+ "step": 4158 + }, + { + "epoch": 1.288840262582057, + "grad_norm": 0.275390625, + "learning_rate": 0.00015238491734516916, + "loss": 1.6902, + "step": 4159 + }, + { + "epoch": 1.289152860268834, + "grad_norm": 0.2734375, + "learning_rate": 0.000152363986284228, + "loss": 1.6487, + "step": 4160 + }, + { + "epoch": 1.289465457955611, + "grad_norm": 0.296875, + "learning_rate": 0.00015234305206200625, + "loss": 1.5843, + "step": 4161 + }, + { + "epoch": 1.2897780556423881, + "grad_norm": 0.27734375, + "learning_rate": 0.00015232211467976775, + "loss": 1.9194, + "step": 4162 + }, + { + "epoch": 1.2900906533291654, + "grad_norm": 0.267578125, + "learning_rate": 0.00015230117413877654, + "loss": 1.6171, + "step": 4163 + }, + { + "epoch": 1.2904032510159424, + "grad_norm": 0.3359375, + "learning_rate": 0.00015228023044029673, + "loss": 2.1398, + "step": 4164 + }, + { + "epoch": 1.2907158487027197, + "grad_norm": 0.296875, + "learning_rate": 0.00015225928358559285, + "loss": 1.7288, + "step": 4165 + }, + { + "epoch": 1.2910284463894968, + "grad_norm": 0.275390625, + "learning_rate": 0.0001522383335759294, + "loss": 1.4731, + "step": 4166 + }, + { + "epoch": 1.2913410440762738, + "grad_norm": 0.29296875, + "learning_rate": 0.0001522173804125712, + "loss": 1.8033, + "step": 4167 + }, + { + "epoch": 1.2916536417630509, + "grad_norm": 0.279296875, + "learning_rate": 0.00015219642409678317, + "loss": 1.4459, + "step": 4168 + }, + { + "epoch": 1.2919662394498281, + "grad_norm": 0.2734375, + "learning_rate": 0.0001521754646298305, + "loss": 1.7603, + "step": 4169 + }, + { + "epoch": 1.2922788371366052, + "grad_norm": 0.298828125, + "learning_rate": 0.0001521545020129786, + "loss": 1.405, + "step": 4170 + }, + { + "epoch": 1.2925914348233822, + "grad_norm": 0.2734375, + "learning_rate": 0.0001521335362474929, + "loss": 1.6413, + "step": 4171 + }, + { + "epoch": 1.2929040325101595, + "grad_norm": 0.287109375, + "learning_rate": 0.0001521125673346392, + "loss": 1.7767, + "step": 4172 + }, + { + "epoch": 1.2932166301969366, + "grad_norm": 0.275390625, + "learning_rate": 0.00015209159527568343, + "loss": 1.3952, + "step": 4173 + }, + { + "epoch": 1.2935292278837136, + "grad_norm": 0.2734375, + "learning_rate": 0.00015207062007189165, + "loss": 1.7186, + "step": 4174 + }, + { + "epoch": 1.2938418255704907, + "grad_norm": 0.27734375, + "learning_rate": 0.00015204964172453014, + "loss": 1.735, + "step": 4175 + }, + { + "epoch": 1.294154423257268, + "grad_norm": 0.259765625, + "learning_rate": 0.0001520286602348655, + "loss": 1.6331, + "step": 4176 + }, + { + "epoch": 1.294467020944045, + "grad_norm": 0.263671875, + "learning_rate": 0.0001520076756041643, + "loss": 1.2599, + "step": 4177 + }, + { + "epoch": 1.2947796186308222, + "grad_norm": 0.255859375, + "learning_rate": 0.00015198668783369346, + "loss": 1.5036, + "step": 4178 + }, + { + "epoch": 1.2950922163175993, + "grad_norm": 0.259765625, + "learning_rate": 0.00015196569692472005, + "loss": 1.5502, + "step": 4179 + }, + { + "epoch": 1.2954048140043763, + "grad_norm": 0.265625, + "learning_rate": 0.00015194470287851125, + "loss": 1.6561, + "step": 4180 + }, + { + "epoch": 1.2957174116911534, + "grad_norm": 0.2734375, + "learning_rate": 0.00015192370569633458, + "loss": 1.5461, + "step": 4181 + }, + { + "epoch": 1.2960300093779307, + "grad_norm": 0.2734375, + "learning_rate": 0.0001519027053794576, + "loss": 1.7504, + "step": 4182 + }, + { + "epoch": 1.2963426070647077, + "grad_norm": 0.283203125, + "learning_rate": 0.00015188170192914822, + "loss": 
1.7119, + "step": 4183 + }, + { + "epoch": 1.2966552047514848, + "grad_norm": 0.275390625, + "learning_rate": 0.0001518606953466743, + "loss": 1.2634, + "step": 4184 + }, + { + "epoch": 1.296967802438262, + "grad_norm": 0.2734375, + "learning_rate": 0.00015183968563330414, + "loss": 1.4318, + "step": 4185 + }, + { + "epoch": 1.297280400125039, + "grad_norm": 0.271484375, + "learning_rate": 0.0001518186727903061, + "loss": 1.5242, + "step": 4186 + }, + { + "epoch": 1.2975929978118161, + "grad_norm": 0.26953125, + "learning_rate": 0.0001517976568189488, + "loss": 1.7578, + "step": 4187 + }, + { + "epoch": 1.2979055954985932, + "grad_norm": 0.271484375, + "learning_rate": 0.00015177663772050087, + "loss": 1.6491, + "step": 4188 + }, + { + "epoch": 1.2982181931853705, + "grad_norm": 0.275390625, + "learning_rate": 0.0001517556154962314, + "loss": 1.7881, + "step": 4189 + }, + { + "epoch": 1.2985307908721475, + "grad_norm": 0.28515625, + "learning_rate": 0.00015173459014740945, + "loss": 1.3904, + "step": 4190 + }, + { + "epoch": 1.2988433885589248, + "grad_norm": 0.29296875, + "learning_rate": 0.0001517135616753044, + "loss": 1.6517, + "step": 4191 + }, + { + "epoch": 1.2991559862457018, + "grad_norm": 0.3515625, + "learning_rate": 0.00015169253008118566, + "loss": 2.2789, + "step": 4192 + }, + { + "epoch": 1.2994685839324789, + "grad_norm": 0.2734375, + "learning_rate": 0.00015167149536632305, + "loss": 1.7229, + "step": 4193 + }, + { + "epoch": 1.299781181619256, + "grad_norm": 0.291015625, + "learning_rate": 0.00015165045753198642, + "loss": 1.6937, + "step": 4194 + }, + { + "epoch": 1.3000937793060332, + "grad_norm": 0.283203125, + "learning_rate": 0.00015162941657944585, + "loss": 1.3607, + "step": 4195 + }, + { + "epoch": 1.3004063769928103, + "grad_norm": 0.275390625, + "learning_rate": 0.0001516083725099716, + "loss": 1.5234, + "step": 4196 + }, + { + "epoch": 1.3007189746795873, + "grad_norm": 0.267578125, + "learning_rate": 0.00015158732532483414, + "loss": 1.6549, + "step": 4197 + }, + { + "epoch": 1.3010315723663646, + "grad_norm": 0.283203125, + "learning_rate": 0.0001515662750253041, + "loss": 1.5429, + "step": 4198 + }, + { + "epoch": 1.3013441700531416, + "grad_norm": 0.2734375, + "learning_rate": 0.00015154522161265236, + "loss": 1.5854, + "step": 4199 + }, + { + "epoch": 1.3016567677399187, + "grad_norm": 0.265625, + "learning_rate": 0.00015152416508814985, + "loss": 1.8248, + "step": 4200 + }, + { + "epoch": 1.3019693654266957, + "grad_norm": 0.271484375, + "learning_rate": 0.00015150310545306793, + "loss": 1.6711, + "step": 4201 + }, + { + "epoch": 1.302281963113473, + "grad_norm": 0.279296875, + "learning_rate": 0.00015148204270867783, + "loss": 1.5947, + "step": 4202 + }, + { + "epoch": 1.30259456080025, + "grad_norm": 0.26953125, + "learning_rate": 0.0001514609768562512, + "loss": 1.3376, + "step": 4203 + }, + { + "epoch": 1.3029071584870273, + "grad_norm": 0.275390625, + "learning_rate": 0.00015143990789705984, + "loss": 1.6146, + "step": 4204 + }, + { + "epoch": 1.3032197561738044, + "grad_norm": 0.287109375, + "learning_rate": 0.00015141883583237568, + "loss": 1.4284, + "step": 4205 + }, + { + "epoch": 1.3035323538605814, + "grad_norm": 0.255859375, + "learning_rate": 0.00015139776066347088, + "loss": 1.4654, + "step": 4206 + }, + { + "epoch": 1.3038449515473585, + "grad_norm": 0.283203125, + "learning_rate": 0.00015137668239161782, + "loss": 1.4928, + "step": 4207 + }, + { + "epoch": 1.3041575492341357, + "grad_norm": 0.2890625, + "learning_rate": 
0.0001513556010180889, + "loss": 1.5986, + "step": 4208 + }, + { + "epoch": 1.3044701469209128, + "grad_norm": 0.275390625, + "learning_rate": 0.00015133451654415696, + "loss": 1.4188, + "step": 4209 + }, + { + "epoch": 1.3047827446076898, + "grad_norm": 0.2890625, + "learning_rate": 0.00015131342897109482, + "loss": 1.5309, + "step": 4210 + }, + { + "epoch": 1.305095342294467, + "grad_norm": 0.291015625, + "learning_rate": 0.00015129233830017558, + "loss": 1.7263, + "step": 4211 + }, + { + "epoch": 1.3054079399812442, + "grad_norm": 0.28515625, + "learning_rate": 0.0001512712445326725, + "loss": 1.3039, + "step": 4212 + }, + { + "epoch": 1.3057205376680212, + "grad_norm": 0.28515625, + "learning_rate": 0.00015125014766985908, + "loss": 1.888, + "step": 4213 + }, + { + "epoch": 1.3060331353547983, + "grad_norm": 0.279296875, + "learning_rate": 0.0001512290477130089, + "loss": 1.4978, + "step": 4214 + }, + { + "epoch": 1.3063457330415755, + "grad_norm": 0.2578125, + "learning_rate": 0.00015120794466339587, + "loss": 1.6995, + "step": 4215 + }, + { + "epoch": 1.3066583307283526, + "grad_norm": 0.255859375, + "learning_rate": 0.00015118683852229393, + "loss": 1.5695, + "step": 4216 + }, + { + "epoch": 1.3069709284151299, + "grad_norm": 0.328125, + "learning_rate": 0.00015116572929097733, + "loss": 2.5649, + "step": 4217 + }, + { + "epoch": 1.307283526101907, + "grad_norm": 0.275390625, + "learning_rate": 0.00015114461697072047, + "loss": 1.3824, + "step": 4218 + }, + { + "epoch": 1.307596123788684, + "grad_norm": 0.275390625, + "learning_rate": 0.0001511235015627979, + "loss": 1.5618, + "step": 4219 + }, + { + "epoch": 1.307908721475461, + "grad_norm": 0.28515625, + "learning_rate": 0.0001511023830684844, + "loss": 1.9096, + "step": 4220 + }, + { + "epoch": 1.3082213191622383, + "grad_norm": 0.267578125, + "learning_rate": 0.0001510812614890549, + "loss": 1.4379, + "step": 4221 + }, + { + "epoch": 1.3085339168490153, + "grad_norm": 0.279296875, + "learning_rate": 0.00015106013682578454, + "loss": 1.3855, + "step": 4222 + }, + { + "epoch": 1.3088465145357924, + "grad_norm": 0.26171875, + "learning_rate": 0.00015103900907994868, + "loss": 1.6518, + "step": 4223 + }, + { + "epoch": 1.3091591122225696, + "grad_norm": 0.279296875, + "learning_rate": 0.0001510178782528228, + "loss": 1.6682, + "step": 4224 + }, + { + "epoch": 1.3094717099093467, + "grad_norm": 0.263671875, + "learning_rate": 0.00015099674434568261, + "loss": 1.4411, + "step": 4225 + }, + { + "epoch": 1.3097843075961237, + "grad_norm": 0.259765625, + "learning_rate": 0.0001509756073598039, + "loss": 1.7525, + "step": 4226 + }, + { + "epoch": 1.3100969052829008, + "grad_norm": 0.28515625, + "learning_rate": 0.0001509544672964629, + "loss": 1.5352, + "step": 4227 + }, + { + "epoch": 1.310409502969678, + "grad_norm": 0.267578125, + "learning_rate": 0.00015093332415693574, + "loss": 1.4847, + "step": 4228 + }, + { + "epoch": 1.3107221006564551, + "grad_norm": 0.27734375, + "learning_rate": 0.0001509121779424989, + "loss": 1.6143, + "step": 4229 + }, + { + "epoch": 1.3110346983432324, + "grad_norm": 0.283203125, + "learning_rate": 0.00015089102865442904, + "loss": 1.6455, + "step": 4230 + }, + { + "epoch": 1.3113472960300094, + "grad_norm": 0.267578125, + "learning_rate": 0.0001508698762940029, + "loss": 1.5467, + "step": 4231 + }, + { + "epoch": 1.3116598937167865, + "grad_norm": 0.263671875, + "learning_rate": 0.00015084872086249746, + "loss": 1.4301, + "step": 4232 + }, + { + "epoch": 1.3119724914035635, + "grad_norm": 0.2578125, + 
"learning_rate": 0.00015082756236118998, + "loss": 1.7866, + "step": 4233 + }, + { + "epoch": 1.3122850890903408, + "grad_norm": 0.26953125, + "learning_rate": 0.0001508064007913578, + "loss": 1.4359, + "step": 4234 + }, + { + "epoch": 1.3125976867771179, + "grad_norm": 0.265625, + "learning_rate": 0.00015078523615427844, + "loss": 1.2895, + "step": 4235 + }, + { + "epoch": 1.312910284463895, + "grad_norm": 0.2734375, + "learning_rate": 0.0001507640684512297, + "loss": 1.673, + "step": 4236 + }, + { + "epoch": 1.3132228821506722, + "grad_norm": 0.26953125, + "learning_rate": 0.0001507428976834894, + "loss": 1.4894, + "step": 4237 + }, + { + "epoch": 1.3135354798374492, + "grad_norm": 0.26953125, + "learning_rate": 0.00015072172385233575, + "loss": 1.5068, + "step": 4238 + }, + { + "epoch": 1.3138480775242263, + "grad_norm": 0.265625, + "learning_rate": 0.00015070054695904696, + "loss": 1.4145, + "step": 4239 + }, + { + "epoch": 1.3141606752110033, + "grad_norm": 0.26953125, + "learning_rate": 0.00015067936700490154, + "loss": 1.4855, + "step": 4240 + }, + { + "epoch": 1.3144732728977806, + "grad_norm": 0.275390625, + "learning_rate": 0.0001506581839911782, + "loss": 1.5471, + "step": 4241 + }, + { + "epoch": 1.3147858705845576, + "grad_norm": 0.2734375, + "learning_rate": 0.0001506369979191557, + "loss": 1.5411, + "step": 4242 + }, + { + "epoch": 1.315098468271335, + "grad_norm": 0.271484375, + "learning_rate": 0.00015061580879011314, + "loss": 1.4331, + "step": 4243 + }, + { + "epoch": 1.315411065958112, + "grad_norm": 0.2734375, + "learning_rate": 0.00015059461660532966, + "loss": 1.5143, + "step": 4244 + }, + { + "epoch": 1.315723663644889, + "grad_norm": 0.2734375, + "learning_rate": 0.00015057342136608472, + "loss": 1.5241, + "step": 4245 + }, + { + "epoch": 1.316036261331666, + "grad_norm": 0.26171875, + "learning_rate": 0.00015055222307365788, + "loss": 1.7544, + "step": 4246 + }, + { + "epoch": 1.3163488590184433, + "grad_norm": 0.263671875, + "learning_rate": 0.00015053102172932895, + "loss": 1.4044, + "step": 4247 + }, + { + "epoch": 1.3166614567052204, + "grad_norm": 0.28515625, + "learning_rate": 0.0001505098173343778, + "loss": 1.6263, + "step": 4248 + }, + { + "epoch": 1.3169740543919974, + "grad_norm": 0.2734375, + "learning_rate": 0.00015048860989008467, + "loss": 1.3161, + "step": 4249 + }, + { + "epoch": 1.3172866520787747, + "grad_norm": 0.30078125, + "learning_rate": 0.00015046739939772973, + "loss": 1.6841, + "step": 4250 + }, + { + "epoch": 1.3175992497655518, + "grad_norm": 0.263671875, + "learning_rate": 0.00015044618585859367, + "loss": 1.5257, + "step": 4251 + }, + { + "epoch": 1.3179118474523288, + "grad_norm": 0.27734375, + "learning_rate": 0.00015042496927395702, + "loss": 1.4318, + "step": 4252 + }, + { + "epoch": 1.3182244451391059, + "grad_norm": 0.271484375, + "learning_rate": 0.00015040374964510076, + "loss": 1.6423, + "step": 4253 + }, + { + "epoch": 1.3185370428258831, + "grad_norm": 0.271484375, + "learning_rate": 0.00015038252697330588, + "loss": 1.6024, + "step": 4254 + }, + { + "epoch": 1.3188496405126602, + "grad_norm": 0.263671875, + "learning_rate": 0.00015036130125985366, + "loss": 1.4965, + "step": 4255 + }, + { + "epoch": 1.3191622381994375, + "grad_norm": 0.283203125, + "learning_rate": 0.0001503400725060255, + "loss": 1.6, + "step": 4256 + }, + { + "epoch": 1.3194748358862145, + "grad_norm": 0.27734375, + "learning_rate": 0.000150318840713103, + "loss": 1.8479, + "step": 4257 + }, + { + "epoch": 1.3197874335729916, + "grad_norm": 0.27734375, 
+ "learning_rate": 0.00015029760588236796, + "loss": 1.567, + "step": 4258 + }, + { + "epoch": 1.3201000312597686, + "grad_norm": 0.263671875, + "learning_rate": 0.00015027636801510238, + "loss": 1.4983, + "step": 4259 + }, + { + "epoch": 1.3204126289465459, + "grad_norm": 0.28125, + "learning_rate": 0.0001502551271125884, + "loss": 1.448, + "step": 4260 + }, + { + "epoch": 1.320725226633323, + "grad_norm": 0.271484375, + "learning_rate": 0.00015023388317610833, + "loss": 1.8337, + "step": 4261 + }, + { + "epoch": 1.3210378243201, + "grad_norm": 0.267578125, + "learning_rate": 0.00015021263620694476, + "loss": 1.4686, + "step": 4262 + }, + { + "epoch": 1.3213504220068772, + "grad_norm": 0.267578125, + "learning_rate": 0.0001501913862063803, + "loss": 1.6757, + "step": 4263 + }, + { + "epoch": 1.3216630196936543, + "grad_norm": 0.2734375, + "learning_rate": 0.00015017013317569793, + "loss": 1.6048, + "step": 4264 + }, + { + "epoch": 1.3219756173804313, + "grad_norm": 0.271484375, + "learning_rate": 0.0001501488771161807, + "loss": 1.2832, + "step": 4265 + }, + { + "epoch": 1.3222882150672084, + "grad_norm": 0.265625, + "learning_rate": 0.00015012761802911184, + "loss": 1.4612, + "step": 4266 + }, + { + "epoch": 1.3226008127539857, + "grad_norm": 0.263671875, + "learning_rate": 0.00015010635591577477, + "loss": 1.7251, + "step": 4267 + }, + { + "epoch": 1.3229134104407627, + "grad_norm": 0.259765625, + "learning_rate": 0.00015008509077745318, + "loss": 1.6412, + "step": 4268 + }, + { + "epoch": 1.32322600812754, + "grad_norm": 0.2734375, + "learning_rate": 0.00015006382261543083, + "loss": 1.4868, + "step": 4269 + }, + { + "epoch": 1.323538605814317, + "grad_norm": 0.28125, + "learning_rate": 0.0001500425514309917, + "loss": 1.5384, + "step": 4270 + }, + { + "epoch": 1.323851203501094, + "grad_norm": 0.28125, + "learning_rate": 0.00015002127722542, + "loss": 1.7487, + "step": 4271 + }, + { + "epoch": 1.3241638011878711, + "grad_norm": 0.28125, + "learning_rate": 0.00015000000000000001, + "loss": 1.6189, + "step": 4272 + }, + { + "epoch": 1.3244763988746484, + "grad_norm": 0.27734375, + "learning_rate": 0.0001499787197560163, + "loss": 1.5159, + "step": 4273 + }, + { + "epoch": 1.3247889965614255, + "grad_norm": 0.26953125, + "learning_rate": 0.00014995743649475363, + "loss": 1.4979, + "step": 4274 + }, + { + "epoch": 1.3251015942482025, + "grad_norm": 0.275390625, + "learning_rate": 0.00014993615021749684, + "loss": 1.6714, + "step": 4275 + }, + { + "epoch": 1.3254141919349798, + "grad_norm": 0.27734375, + "learning_rate": 0.00014991486092553102, + "loss": 1.4344, + "step": 4276 + }, + { + "epoch": 1.3257267896217568, + "grad_norm": 0.265625, + "learning_rate": 0.00014989356862014146, + "loss": 1.7938, + "step": 4277 + }, + { + "epoch": 1.3260393873085339, + "grad_norm": 0.248046875, + "learning_rate": 0.00014987227330261356, + "loss": 1.6565, + "step": 4278 + }, + { + "epoch": 1.326351984995311, + "grad_norm": 0.263671875, + "learning_rate": 0.00014985097497423298, + "loss": 1.2759, + "step": 4279 + }, + { + "epoch": 1.3266645826820882, + "grad_norm": 0.263671875, + "learning_rate": 0.0001498296736362855, + "loss": 1.799, + "step": 4280 + }, + { + "epoch": 1.3269771803688653, + "grad_norm": 0.263671875, + "learning_rate": 0.00014980836929005714, + "loss": 1.4938, + "step": 4281 + }, + { + "epoch": 1.3272897780556425, + "grad_norm": 0.26171875, + "learning_rate": 0.00014978706193683405, + "loss": 1.422, + "step": 4282 + }, + { + "epoch": 1.3276023757424196, + "grad_norm": 0.26171875, + 
"learning_rate": 0.00014976575157790262, + "loss": 1.3706, + "step": 4283 + }, + { + "epoch": 1.3279149734291966, + "grad_norm": 0.26953125, + "learning_rate": 0.00014974443821454928, + "loss": 1.4645, + "step": 4284 + }, + { + "epoch": 1.3282275711159737, + "grad_norm": 0.275390625, + "learning_rate": 0.00014972312184806085, + "loss": 1.4608, + "step": 4285 + }, + { + "epoch": 1.328540168802751, + "grad_norm": 0.26953125, + "learning_rate": 0.0001497018024797242, + "loss": 1.4634, + "step": 4286 + }, + { + "epoch": 1.328852766489528, + "grad_norm": 0.2734375, + "learning_rate": 0.00014968048011082638, + "loss": 1.6588, + "step": 4287 + }, + { + "epoch": 1.329165364176305, + "grad_norm": 0.265625, + "learning_rate": 0.00014965915474265468, + "loss": 1.528, + "step": 4288 + }, + { + "epoch": 1.3294779618630823, + "grad_norm": 0.28125, + "learning_rate": 0.00014963782637649652, + "loss": 1.3804, + "step": 4289 + }, + { + "epoch": 1.3297905595498594, + "grad_norm": 0.26953125, + "learning_rate": 0.00014961649501363955, + "loss": 1.6437, + "step": 4290 + }, + { + "epoch": 1.3301031572366364, + "grad_norm": 0.263671875, + "learning_rate": 0.0001495951606553715, + "loss": 1.6061, + "step": 4291 + }, + { + "epoch": 1.3304157549234135, + "grad_norm": 0.275390625, + "learning_rate": 0.00014957382330298046, + "loss": 1.3768, + "step": 4292 + }, + { + "epoch": 1.3307283526101907, + "grad_norm": 0.265625, + "learning_rate": 0.0001495524829577545, + "loss": 1.6755, + "step": 4293 + }, + { + "epoch": 1.3310409502969678, + "grad_norm": 0.279296875, + "learning_rate": 0.00014953113962098201, + "loss": 1.3769, + "step": 4294 + }, + { + "epoch": 1.331353547983745, + "grad_norm": 0.267578125, + "learning_rate": 0.0001495097932939515, + "loss": 1.3683, + "step": 4295 + }, + { + "epoch": 1.331666145670522, + "grad_norm": 0.283203125, + "learning_rate": 0.00014948844397795168, + "loss": 1.4352, + "step": 4296 + }, + { + "epoch": 1.3319787433572992, + "grad_norm": 0.26953125, + "learning_rate": 0.00014946709167427142, + "loss": 1.4394, + "step": 4297 + }, + { + "epoch": 1.3322913410440762, + "grad_norm": 0.2734375, + "learning_rate": 0.0001494457363841998, + "loss": 1.5841, + "step": 4298 + }, + { + "epoch": 1.3326039387308533, + "grad_norm": 0.2734375, + "learning_rate": 0.00014942437810902607, + "loss": 1.46, + "step": 4299 + }, + { + "epoch": 1.3329165364176305, + "grad_norm": 0.275390625, + "learning_rate": 0.00014940301685003967, + "loss": 1.7945, + "step": 4300 + }, + { + "epoch": 1.3332291341044076, + "grad_norm": 0.271484375, + "learning_rate": 0.00014938165260853018, + "loss": 1.557, + "step": 4301 + }, + { + "epoch": 1.3335417317911848, + "grad_norm": 0.28515625, + "learning_rate": 0.0001493602853857874, + "loss": 1.6546, + "step": 4302 + }, + { + "epoch": 1.333854329477962, + "grad_norm": 0.26953125, + "learning_rate": 0.00014933891518310126, + "loss": 1.204, + "step": 4303 + }, + { + "epoch": 1.334166927164739, + "grad_norm": 0.263671875, + "learning_rate": 0.000149317542001762, + "loss": 1.4083, + "step": 4304 + }, + { + "epoch": 1.334479524851516, + "grad_norm": 0.271484375, + "learning_rate": 0.0001492961658430598, + "loss": 1.5677, + "step": 4305 + }, + { + "epoch": 1.3347921225382933, + "grad_norm": 0.27734375, + "learning_rate": 0.0001492747867082853, + "loss": 1.5351, + "step": 4306 + }, + { + "epoch": 1.3351047202250703, + "grad_norm": 0.267578125, + "learning_rate": 0.00014925340459872913, + "loss": 1.635, + "step": 4307 + }, + { + "epoch": 1.3354173179118476, + "grad_norm": 0.259765625, + 
"learning_rate": 0.00014923201951568216, + "loss": 1.5736, + "step": 4308 + }, + { + "epoch": 1.3357299155986246, + "grad_norm": 0.263671875, + "learning_rate": 0.00014921063146043542, + "loss": 1.6811, + "step": 4309 + }, + { + "epoch": 1.3360425132854017, + "grad_norm": 0.267578125, + "learning_rate": 0.00014918924043428016, + "loss": 1.673, + "step": 4310 + }, + { + "epoch": 1.3363551109721787, + "grad_norm": 0.2734375, + "learning_rate": 0.00014916784643850773, + "loss": 1.6695, + "step": 4311 + }, + { + "epoch": 1.3366677086589558, + "grad_norm": 0.283203125, + "learning_rate": 0.00014914644947440982, + "loss": 1.6808, + "step": 4312 + }, + { + "epoch": 1.336980306345733, + "grad_norm": 0.267578125, + "learning_rate": 0.0001491250495432781, + "loss": 1.2547, + "step": 4313 + }, + { + "epoch": 1.33729290403251, + "grad_norm": 0.2734375, + "learning_rate": 0.00014910364664640454, + "loss": 1.5897, + "step": 4314 + }, + { + "epoch": 1.3376055017192874, + "grad_norm": 0.2890625, + "learning_rate": 0.00014908224078508125, + "loss": 1.5416, + "step": 4315 + }, + { + "epoch": 1.3379180994060644, + "grad_norm": 0.259765625, + "learning_rate": 0.00014906083196060058, + "loss": 1.3056, + "step": 4316 + }, + { + "epoch": 1.3382306970928415, + "grad_norm": 0.275390625, + "learning_rate": 0.0001490394201742549, + "loss": 1.3732, + "step": 4317 + }, + { + "epoch": 1.3385432947796185, + "grad_norm": 0.2734375, + "learning_rate": 0.000149018005427337, + "loss": 1.51, + "step": 4318 + }, + { + "epoch": 1.3388558924663958, + "grad_norm": 0.2890625, + "learning_rate": 0.0001489965877211396, + "loss": 1.6639, + "step": 4319 + }, + { + "epoch": 1.3391684901531729, + "grad_norm": 0.26171875, + "learning_rate": 0.00014897516705695578, + "loss": 1.6686, + "step": 4320 + }, + { + "epoch": 1.33948108783995, + "grad_norm": 0.275390625, + "learning_rate": 0.0001489537434360787, + "loss": 1.5546, + "step": 4321 + }, + { + "epoch": 1.3397936855267272, + "grad_norm": 0.28125, + "learning_rate": 0.00014893231685980175, + "loss": 1.5238, + "step": 4322 + }, + { + "epoch": 1.3401062832135042, + "grad_norm": 0.263671875, + "learning_rate": 0.0001489108873294185, + "loss": 1.7619, + "step": 4323 + }, + { + "epoch": 1.3404188809002813, + "grad_norm": 0.263671875, + "learning_rate": 0.00014888945484622265, + "loss": 1.4808, + "step": 4324 + }, + { + "epoch": 1.3407314785870583, + "grad_norm": 0.275390625, + "learning_rate": 0.0001488680194115081, + "loss": 1.5137, + "step": 4325 + }, + { + "epoch": 1.3410440762738356, + "grad_norm": 0.275390625, + "learning_rate": 0.00014884658102656893, + "loss": 1.4063, + "step": 4326 + }, + { + "epoch": 1.3413566739606126, + "grad_norm": 0.279296875, + "learning_rate": 0.0001488251396926994, + "loss": 1.6675, + "step": 4327 + }, + { + "epoch": 1.34166927164739, + "grad_norm": 0.267578125, + "learning_rate": 0.00014880369541119402, + "loss": 1.415, + "step": 4328 + }, + { + "epoch": 1.341981869334167, + "grad_norm": 0.271484375, + "learning_rate": 0.00014878224818334733, + "loss": 1.5076, + "step": 4329 + }, + { + "epoch": 1.342294467020944, + "grad_norm": 0.27734375, + "learning_rate": 0.0001487607980104542, + "loss": 1.3998, + "step": 4330 + }, + { + "epoch": 1.342607064707721, + "grad_norm": 0.2890625, + "learning_rate": 0.0001487393448938095, + "loss": 1.7118, + "step": 4331 + }, + { + "epoch": 1.3429196623944983, + "grad_norm": 0.25390625, + "learning_rate": 0.00014871788883470845, + "loss": 1.5633, + "step": 4332 + }, + { + "epoch": 1.3432322600812754, + "grad_norm": 0.275390625, 
+ "learning_rate": 0.00014869642983444638, + "loss": 1.5311, + "step": 4333 + }, + { + "epoch": 1.3435448577680524, + "grad_norm": 0.283203125, + "learning_rate": 0.0001486749678943188, + "loss": 1.5362, + "step": 4334 + }, + { + "epoch": 1.3438574554548297, + "grad_norm": 0.271484375, + "learning_rate": 0.00014865350301562134, + "loss": 1.4536, + "step": 4335 + }, + { + "epoch": 1.3441700531416068, + "grad_norm": 0.275390625, + "learning_rate": 0.00014863203519964994, + "loss": 1.5429, + "step": 4336 + }, + { + "epoch": 1.3444826508283838, + "grad_norm": 0.27734375, + "learning_rate": 0.00014861056444770058, + "loss": 1.5523, + "step": 4337 + }, + { + "epoch": 1.3447952485151609, + "grad_norm": 0.263671875, + "learning_rate": 0.0001485890907610695, + "loss": 1.7627, + "step": 4338 + }, + { + "epoch": 1.3451078462019381, + "grad_norm": 0.267578125, + "learning_rate": 0.00014856761414105312, + "loss": 1.5484, + "step": 4339 + }, + { + "epoch": 1.3454204438887152, + "grad_norm": 0.26953125, + "learning_rate": 0.00014854613458894795, + "loss": 1.4169, + "step": 4340 + }, + { + "epoch": 1.3457330415754925, + "grad_norm": 0.271484375, + "learning_rate": 0.00014852465210605077, + "loss": 1.368, + "step": 4341 + }, + { + "epoch": 1.3460456392622695, + "grad_norm": 0.271484375, + "learning_rate": 0.00014850316669365855, + "loss": 1.6043, + "step": 4342 + }, + { + "epoch": 1.3463582369490465, + "grad_norm": 0.27734375, + "learning_rate": 0.00014848167835306833, + "loss": 1.5628, + "step": 4343 + }, + { + "epoch": 1.3466708346358236, + "grad_norm": 0.26171875, + "learning_rate": 0.0001484601870855774, + "loss": 1.5112, + "step": 4344 + }, + { + "epoch": 1.3469834323226009, + "grad_norm": 0.28515625, + "learning_rate": 0.0001484386928924832, + "loss": 1.5425, + "step": 4345 + }, + { + "epoch": 1.347296030009378, + "grad_norm": 0.259765625, + "learning_rate": 0.00014841719577508343, + "loss": 1.6774, + "step": 4346 + }, + { + "epoch": 1.347608627696155, + "grad_norm": 0.271484375, + "learning_rate": 0.0001483956957346758, + "loss": 1.4458, + "step": 4347 + }, + { + "epoch": 1.3479212253829322, + "grad_norm": 0.26953125, + "learning_rate": 0.0001483741927725584, + "loss": 1.6429, + "step": 4348 + }, + { + "epoch": 1.3482338230697093, + "grad_norm": 0.263671875, + "learning_rate": 0.0001483526868900293, + "loss": 1.5116, + "step": 4349 + }, + { + "epoch": 1.3485464207564863, + "grad_norm": 0.279296875, + "learning_rate": 0.0001483311780883869, + "loss": 1.5369, + "step": 4350 + }, + { + "epoch": 1.3488590184432634, + "grad_norm": 0.267578125, + "learning_rate": 0.00014830966636892966, + "loss": 1.2129, + "step": 4351 + }, + { + "epoch": 1.3491716161300407, + "grad_norm": 0.27734375, + "learning_rate": 0.00014828815173295633, + "loss": 1.5552, + "step": 4352 + }, + { + "epoch": 1.3494842138168177, + "grad_norm": 0.26953125, + "learning_rate": 0.00014826663418176573, + "loss": 1.6241, + "step": 4353 + }, + { + "epoch": 1.349796811503595, + "grad_norm": 0.27734375, + "learning_rate": 0.00014824511371665694, + "loss": 1.8416, + "step": 4354 + }, + { + "epoch": 1.350109409190372, + "grad_norm": 0.267578125, + "learning_rate": 0.00014822359033892914, + "loss": 1.8527, + "step": 4355 + }, + { + "epoch": 1.350422006877149, + "grad_norm": 0.287109375, + "learning_rate": 0.00014820206404988175, + "loss": 1.669, + "step": 4356 + }, + { + "epoch": 1.3507346045639261, + "grad_norm": 0.275390625, + "learning_rate": 0.00014818053485081428, + "loss": 1.414, + "step": 4357 + }, + { + "epoch": 1.3510472022507034, + 
"grad_norm": 0.275390625, + "learning_rate": 0.0001481590027430266, + "loss": 1.3761, + "step": 4358 + }, + { + "epoch": 1.3513597999374805, + "grad_norm": 0.263671875, + "learning_rate": 0.00014813746772781848, + "loss": 1.5144, + "step": 4359 + }, + { + "epoch": 1.3516723976242575, + "grad_norm": 0.27734375, + "learning_rate": 0.00014811592980649017, + "loss": 1.5364, + "step": 4360 + }, + { + "epoch": 1.3519849953110348, + "grad_norm": 0.26171875, + "learning_rate": 0.00014809438898034181, + "loss": 1.6639, + "step": 4361 + }, + { + "epoch": 1.3522975929978118, + "grad_norm": 0.27734375, + "learning_rate": 0.00014807284525067388, + "loss": 1.4128, + "step": 4362 + }, + { + "epoch": 1.3526101906845889, + "grad_norm": 0.267578125, + "learning_rate": 0.00014805129861878707, + "loss": 1.553, + "step": 4363 + }, + { + "epoch": 1.352922788371366, + "grad_norm": 0.2734375, + "learning_rate": 0.00014802974908598212, + "loss": 1.6525, + "step": 4364 + }, + { + "epoch": 1.3532353860581432, + "grad_norm": 0.279296875, + "learning_rate": 0.00014800819665355998, + "loss": 1.3909, + "step": 4365 + }, + { + "epoch": 1.3535479837449202, + "grad_norm": 0.306640625, + "learning_rate": 0.00014798664132282188, + "loss": 1.8208, + "step": 4366 + }, + { + "epoch": 1.3538605814316975, + "grad_norm": 0.275390625, + "learning_rate": 0.00014796508309506906, + "loss": 1.6694, + "step": 4367 + }, + { + "epoch": 1.3541731791184746, + "grad_norm": 0.2734375, + "learning_rate": 0.00014794352197160306, + "loss": 1.5647, + "step": 4368 + }, + { + "epoch": 1.3544857768052516, + "grad_norm": 0.28515625, + "learning_rate": 0.0001479219579537255, + "loss": 1.5541, + "step": 4369 + }, + { + "epoch": 1.3547983744920287, + "grad_norm": 0.267578125, + "learning_rate": 0.00014790039104273833, + "loss": 1.579, + "step": 4370 + }, + { + "epoch": 1.355110972178806, + "grad_norm": 0.267578125, + "learning_rate": 0.0001478788212399435, + "loss": 1.6018, + "step": 4371 + }, + { + "epoch": 1.355423569865583, + "grad_norm": 0.26171875, + "learning_rate": 0.00014785724854664325, + "loss": 1.5671, + "step": 4372 + }, + { + "epoch": 1.35573616755236, + "grad_norm": 0.279296875, + "learning_rate": 0.00014783567296413987, + "loss": 1.8291, + "step": 4373 + }, + { + "epoch": 1.3560487652391373, + "grad_norm": 0.267578125, + "learning_rate": 0.000147814094493736, + "loss": 1.2627, + "step": 4374 + }, + { + "epoch": 1.3563613629259144, + "grad_norm": 0.2890625, + "learning_rate": 0.00014779251313673425, + "loss": 1.4725, + "step": 4375 + }, + { + "epoch": 1.3566739606126914, + "grad_norm": 0.275390625, + "learning_rate": 0.0001477709288944376, + "loss": 1.2851, + "step": 4376 + }, + { + "epoch": 1.3569865582994685, + "grad_norm": 0.271484375, + "learning_rate": 0.00014774934176814915, + "loss": 1.2496, + "step": 4377 + }, + { + "epoch": 1.3572991559862457, + "grad_norm": 0.27734375, + "learning_rate": 0.00014772775175917206, + "loss": 1.6098, + "step": 4378 + }, + { + "epoch": 1.3576117536730228, + "grad_norm": 0.279296875, + "learning_rate": 0.00014770615886880973, + "loss": 1.5777, + "step": 4379 + }, + { + "epoch": 1.3579243513598, + "grad_norm": 0.279296875, + "learning_rate": 0.00014768456309836587, + "loss": 1.6135, + "step": 4380 + }, + { + "epoch": 1.358236949046577, + "grad_norm": 0.259765625, + "learning_rate": 0.00014766296444914412, + "loss": 1.6868, + "step": 4381 + }, + { + "epoch": 1.3585495467333542, + "grad_norm": 0.26953125, + "learning_rate": 0.00014764136292244847, + "loss": 1.5089, + "step": 4382 + }, + { + "epoch": 
1.3588621444201312, + "grad_norm": 0.275390625, + "learning_rate": 0.00014761975851958307, + "loss": 1.2554, + "step": 4383 + }, + { + "epoch": 1.3591747421069085, + "grad_norm": 0.2734375, + "learning_rate": 0.00014759815124185214, + "loss": 1.3712, + "step": 4384 + }, + { + "epoch": 1.3594873397936855, + "grad_norm": 0.267578125, + "learning_rate": 0.00014757654109056017, + "loss": 1.5727, + "step": 4385 + }, + { + "epoch": 1.3597999374804626, + "grad_norm": 0.28125, + "learning_rate": 0.0001475549280670118, + "loss": 1.5521, + "step": 4386 + }, + { + "epoch": 1.3601125351672398, + "grad_norm": 0.2734375, + "learning_rate": 0.00014753331217251176, + "loss": 1.6843, + "step": 4387 + }, + { + "epoch": 1.360425132854017, + "grad_norm": 0.263671875, + "learning_rate": 0.00014751169340836514, + "loss": 1.4582, + "step": 4388 + }, + { + "epoch": 1.360737730540794, + "grad_norm": 0.26953125, + "learning_rate": 0.00014749007177587706, + "loss": 1.5996, + "step": 4389 + }, + { + "epoch": 1.361050328227571, + "grad_norm": 0.26171875, + "learning_rate": 0.00014746844727635283, + "loss": 1.5941, + "step": 4390 + }, + { + "epoch": 1.3613629259143483, + "grad_norm": 0.26953125, + "learning_rate": 0.00014744681991109792, + "loss": 1.2932, + "step": 4391 + }, + { + "epoch": 1.3616755236011253, + "grad_norm": 0.283203125, + "learning_rate": 0.00014742518968141803, + "loss": 1.499, + "step": 4392 + }, + { + "epoch": 1.3619881212879026, + "grad_norm": 0.2734375, + "learning_rate": 0.00014740355658861902, + "loss": 1.3968, + "step": 4393 + }, + { + "epoch": 1.3623007189746796, + "grad_norm": 0.279296875, + "learning_rate": 0.00014738192063400689, + "loss": 1.7609, + "step": 4394 + }, + { + "epoch": 1.3626133166614567, + "grad_norm": 0.265625, + "learning_rate": 0.0001473602818188878, + "loss": 1.361, + "step": 4395 + }, + { + "epoch": 1.3629259143482337, + "grad_norm": 0.263671875, + "learning_rate": 0.0001473386401445682, + "loss": 1.4909, + "step": 4396 + }, + { + "epoch": 1.363238512035011, + "grad_norm": 0.27734375, + "learning_rate": 0.00014731699561235453, + "loss": 1.7449, + "step": 4397 + }, + { + "epoch": 1.363551109721788, + "grad_norm": 0.2734375, + "learning_rate": 0.00014729534822355354, + "loss": 1.5876, + "step": 4398 + }, + { + "epoch": 1.363863707408565, + "grad_norm": 0.265625, + "learning_rate": 0.00014727369797947214, + "loss": 1.7072, + "step": 4399 + }, + { + "epoch": 1.3641763050953424, + "grad_norm": 0.28125, + "learning_rate": 0.00014725204488141733, + "loss": 1.603, + "step": 4400 + }, + { + "epoch": 1.3644889027821194, + "grad_norm": 0.287109375, + "learning_rate": 0.00014723038893069636, + "loss": 1.7367, + "step": 4401 + }, + { + "epoch": 1.3648015004688965, + "grad_norm": 0.279296875, + "learning_rate": 0.00014720873012861663, + "loss": 1.6072, + "step": 4402 + }, + { + "epoch": 1.3651140981556735, + "grad_norm": 0.267578125, + "learning_rate": 0.0001471870684764857, + "loss": 1.4582, + "step": 4403 + }, + { + "epoch": 1.3654266958424508, + "grad_norm": 0.26953125, + "learning_rate": 0.00014716540397561128, + "loss": 1.7162, + "step": 4404 + }, + { + "epoch": 1.3657392935292278, + "grad_norm": 0.275390625, + "learning_rate": 0.00014714373662730136, + "loss": 1.465, + "step": 4405 + }, + { + "epoch": 1.3660518912160051, + "grad_norm": 0.275390625, + "learning_rate": 0.00014712206643286398, + "loss": 1.4183, + "step": 4406 + }, + { + "epoch": 1.3663644889027822, + "grad_norm": 0.287109375, + "learning_rate": 0.0001471003933936074, + "loss": 1.8007, + "step": 4407 + }, + { + 
"epoch": 1.3666770865895592, + "grad_norm": 0.2734375, + "learning_rate": 0.00014707871751084003, + "loss": 1.6014, + "step": 4408 + }, + { + "epoch": 1.3669896842763363, + "grad_norm": 0.28125, + "learning_rate": 0.0001470570387858705, + "loss": 1.5298, + "step": 4409 + }, + { + "epoch": 1.3673022819631135, + "grad_norm": 0.265625, + "learning_rate": 0.00014703535722000762, + "loss": 1.5156, + "step": 4410 + }, + { + "epoch": 1.3676148796498906, + "grad_norm": 0.326171875, + "learning_rate": 0.0001470136728145602, + "loss": 2.1164, + "step": 4411 + }, + { + "epoch": 1.3679274773366676, + "grad_norm": 0.2734375, + "learning_rate": 0.0001469919855708375, + "loss": 1.5791, + "step": 4412 + }, + { + "epoch": 1.368240075023445, + "grad_norm": 0.26171875, + "learning_rate": 0.00014697029549014874, + "loss": 1.5723, + "step": 4413 + }, + { + "epoch": 1.368552672710222, + "grad_norm": 0.26953125, + "learning_rate": 0.00014694860257380337, + "loss": 1.7224, + "step": 4414 + }, + { + "epoch": 1.368865270396999, + "grad_norm": 0.2734375, + "learning_rate": 0.00014692690682311106, + "loss": 1.4698, + "step": 4415 + }, + { + "epoch": 1.369177868083776, + "grad_norm": 0.27734375, + "learning_rate": 0.0001469052082393816, + "loss": 1.6278, + "step": 4416 + }, + { + "epoch": 1.3694904657705533, + "grad_norm": 0.259765625, + "learning_rate": 0.0001468835068239249, + "loss": 1.4451, + "step": 4417 + }, + { + "epoch": 1.3698030634573304, + "grad_norm": 0.279296875, + "learning_rate": 0.00014686180257805117, + "loss": 1.556, + "step": 4418 + }, + { + "epoch": 1.3701156611441077, + "grad_norm": 0.28515625, + "learning_rate": 0.0001468400955030707, + "loss": 1.5471, + "step": 4419 + }, + { + "epoch": 1.3704282588308847, + "grad_norm": 0.275390625, + "learning_rate": 0.00014681838560029397, + "loss": 1.4154, + "step": 4420 + }, + { + "epoch": 1.3707408565176618, + "grad_norm": 0.263671875, + "learning_rate": 0.00014679667287103166, + "loss": 1.5814, + "step": 4421 + }, + { + "epoch": 1.3710534542044388, + "grad_norm": 0.28125, + "learning_rate": 0.00014677495731659453, + "loss": 1.4309, + "step": 4422 + }, + { + "epoch": 1.371366051891216, + "grad_norm": 0.283203125, + "learning_rate": 0.00014675323893829364, + "loss": 1.4414, + "step": 4423 + }, + { + "epoch": 1.3716786495779931, + "grad_norm": 0.2734375, + "learning_rate": 0.0001467315177374402, + "loss": 1.4838, + "step": 4424 + }, + { + "epoch": 1.3719912472647702, + "grad_norm": 0.275390625, + "learning_rate": 0.00014670979371534542, + "loss": 1.5512, + "step": 4425 + }, + { + "epoch": 1.3723038449515474, + "grad_norm": 0.287109375, + "learning_rate": 0.0001466880668733209, + "loss": 1.5403, + "step": 4426 + }, + { + "epoch": 1.3726164426383245, + "grad_norm": 0.267578125, + "learning_rate": 0.00014666633721267824, + "loss": 1.2641, + "step": 4427 + }, + { + "epoch": 1.3729290403251015, + "grad_norm": 0.279296875, + "learning_rate": 0.00014664460473472936, + "loss": 1.4777, + "step": 4428 + }, + { + "epoch": 1.3732416380118786, + "grad_norm": 0.271484375, + "learning_rate": 0.00014662286944078625, + "loss": 1.4161, + "step": 4429 + }, + { + "epoch": 1.3735542356986559, + "grad_norm": 0.271484375, + "learning_rate": 0.00014660113133216113, + "loss": 1.2669, + "step": 4430 + }, + { + "epoch": 1.373866833385433, + "grad_norm": 0.26953125, + "learning_rate": 0.0001465793904101663, + "loss": 1.7179, + "step": 4431 + }, + { + "epoch": 1.3741794310722102, + "grad_norm": 0.275390625, + "learning_rate": 0.00014655764667611434, + "loss": 1.9041, + "step": 4432 + }, 
+ { + "epoch": 1.3744920287589872, + "grad_norm": 0.271484375, + "learning_rate": 0.0001465359001313179, + "loss": 1.5532, + "step": 4433 + }, + { + "epoch": 1.3748046264457643, + "grad_norm": 0.26171875, + "learning_rate": 0.00014651415077708986, + "loss": 1.4049, + "step": 4434 + }, + { + "epoch": 1.3751172241325413, + "grad_norm": 0.28515625, + "learning_rate": 0.00014649239861474323, + "loss": 1.6996, + "step": 4435 + }, + { + "epoch": 1.3754298218193186, + "grad_norm": 0.28125, + "learning_rate": 0.00014647064364559133, + "loss": 1.716, + "step": 4436 + }, + { + "epoch": 1.3757424195060957, + "grad_norm": 0.27734375, + "learning_rate": 0.0001464488858709474, + "loss": 1.4723, + "step": 4437 + }, + { + "epoch": 1.3760550171928727, + "grad_norm": 0.26171875, + "learning_rate": 0.0001464271252921251, + "loss": 1.6228, + "step": 4438 + }, + { + "epoch": 1.37636761487965, + "grad_norm": 0.27734375, + "learning_rate": 0.000146405361910438, + "loss": 1.4123, + "step": 4439 + }, + { + "epoch": 1.376680212566427, + "grad_norm": 0.27734375, + "learning_rate": 0.00014638359572720014, + "loss": 1.7919, + "step": 4440 + }, + { + "epoch": 1.376992810253204, + "grad_norm": 0.27734375, + "learning_rate": 0.00014636182674372542, + "loss": 1.5778, + "step": 4441 + }, + { + "epoch": 1.3773054079399811, + "grad_norm": 0.259765625, + "learning_rate": 0.0001463400549613282, + "loss": 1.4883, + "step": 4442 + }, + { + "epoch": 1.3776180056267584, + "grad_norm": 0.28515625, + "learning_rate": 0.0001463182803813228, + "loss": 1.6173, + "step": 4443 + }, + { + "epoch": 1.3779306033135355, + "grad_norm": 0.287109375, + "learning_rate": 0.0001462965030050238, + "loss": 1.7092, + "step": 4444 + }, + { + "epoch": 1.3782432010003127, + "grad_norm": 0.28515625, + "learning_rate": 0.0001462747228337459, + "loss": 1.3684, + "step": 4445 + }, + { + "epoch": 1.3785557986870898, + "grad_norm": 0.2734375, + "learning_rate": 0.00014625293986880402, + "loss": 1.684, + "step": 4446 + }, + { + "epoch": 1.3788683963738668, + "grad_norm": 0.263671875, + "learning_rate": 0.0001462311541115132, + "loss": 1.4425, + "step": 4447 + }, + { + "epoch": 1.3791809940606439, + "grad_norm": 0.287109375, + "learning_rate": 0.00014620936556318872, + "loss": 1.4826, + "step": 4448 + }, + { + "epoch": 1.3794935917474211, + "grad_norm": 0.28515625, + "learning_rate": 0.00014618757422514596, + "loss": 1.5377, + "step": 4449 + }, + { + "epoch": 1.3798061894341982, + "grad_norm": 0.27734375, + "learning_rate": 0.00014616578009870045, + "loss": 1.7899, + "step": 4450 + }, + { + "epoch": 1.3801187871209752, + "grad_norm": 0.27734375, + "learning_rate": 0.000146143983185168, + "loss": 1.4614, + "step": 4451 + }, + { + "epoch": 1.3804313848077525, + "grad_norm": 0.291015625, + "learning_rate": 0.00014612218348586446, + "loss": 1.6547, + "step": 4452 + }, + { + "epoch": 1.3807439824945296, + "grad_norm": 0.2890625, + "learning_rate": 0.00014610038100210592, + "loss": 1.6675, + "step": 4453 + }, + { + "epoch": 1.3810565801813066, + "grad_norm": 0.259765625, + "learning_rate": 0.00014607857573520867, + "loss": 1.3751, + "step": 4454 + }, + { + "epoch": 1.3813691778680837, + "grad_norm": 0.287109375, + "learning_rate": 0.00014605676768648907, + "loss": 1.5265, + "step": 4455 + }, + { + "epoch": 1.381681775554861, + "grad_norm": 0.27734375, + "learning_rate": 0.00014603495685726372, + "loss": 1.6233, + "step": 4456 + }, + { + "epoch": 1.381994373241638, + "grad_norm": 0.271484375, + "learning_rate": 0.00014601314324884935, + "loss": 1.6946, + "step": 4457 
+ }, + { + "epoch": 1.3823069709284153, + "grad_norm": 0.263671875, + "learning_rate": 0.0001459913268625629, + "loss": 1.523, + "step": 4458 + }, + { + "epoch": 1.3826195686151923, + "grad_norm": 0.26953125, + "learning_rate": 0.0001459695076997214, + "loss": 1.3612, + "step": 4459 + }, + { + "epoch": 1.3829321663019694, + "grad_norm": 0.28125, + "learning_rate": 0.00014594768576164216, + "loss": 1.7067, + "step": 4460 + }, + { + "epoch": 1.3832447639887464, + "grad_norm": 0.279296875, + "learning_rate": 0.00014592586104964262, + "loss": 1.5837, + "step": 4461 + }, + { + "epoch": 1.3835573616755237, + "grad_norm": 0.34765625, + "learning_rate": 0.0001459040335650403, + "loss": 2.4029, + "step": 4462 + }, + { + "epoch": 1.3838699593623007, + "grad_norm": 0.26953125, + "learning_rate": 0.00014588220330915297, + "loss": 1.672, + "step": 4463 + }, + { + "epoch": 1.3841825570490778, + "grad_norm": 0.275390625, + "learning_rate": 0.00014586037028329856, + "loss": 1.3609, + "step": 4464 + }, + { + "epoch": 1.384495154735855, + "grad_norm": 0.291015625, + "learning_rate": 0.00014583853448879515, + "loss": 1.5009, + "step": 4465 + }, + { + "epoch": 1.384807752422632, + "grad_norm": 0.267578125, + "learning_rate": 0.000145816695926961, + "loss": 1.5723, + "step": 4466 + }, + { + "epoch": 1.3851203501094091, + "grad_norm": 0.28125, + "learning_rate": 0.00014579485459911452, + "loss": 1.3228, + "step": 4467 + }, + { + "epoch": 1.3854329477961862, + "grad_norm": 0.2890625, + "learning_rate": 0.00014577301050657432, + "loss": 1.6831, + "step": 4468 + }, + { + "epoch": 1.3857455454829635, + "grad_norm": 0.27734375, + "learning_rate": 0.00014575116365065917, + "loss": 1.5932, + "step": 4469 + }, + { + "epoch": 1.3860581431697405, + "grad_norm": 0.271484375, + "learning_rate": 0.00014572931403268794, + "loss": 1.6896, + "step": 4470 + }, + { + "epoch": 1.3863707408565178, + "grad_norm": 0.255859375, + "learning_rate": 0.0001457074616539798, + "loss": 1.5783, + "step": 4471 + }, + { + "epoch": 1.3866833385432948, + "grad_norm": 0.283203125, + "learning_rate": 0.00014568560651585387, + "loss": 1.7211, + "step": 4472 + }, + { + "epoch": 1.386995936230072, + "grad_norm": 0.28515625, + "learning_rate": 0.00014566374861962972, + "loss": 1.4425, + "step": 4473 + }, + { + "epoch": 1.387308533916849, + "grad_norm": 0.28515625, + "learning_rate": 0.00014564188796662686, + "loss": 1.4965, + "step": 4474 + }, + { + "epoch": 1.3876211316036262, + "grad_norm": 0.283203125, + "learning_rate": 0.00014562002455816505, + "loss": 1.8338, + "step": 4475 + }, + { + "epoch": 1.3879337292904033, + "grad_norm": 0.26171875, + "learning_rate": 0.00014559815839556425, + "loss": 1.59, + "step": 4476 + }, + { + "epoch": 1.3882463269771803, + "grad_norm": 0.287109375, + "learning_rate": 0.0001455762894801445, + "loss": 1.5326, + "step": 4477 + }, + { + "epoch": 1.3885589246639576, + "grad_norm": 0.271484375, + "learning_rate": 0.00014555441781322607, + "loss": 1.6449, + "step": 4478 + }, + { + "epoch": 1.3888715223507346, + "grad_norm": 0.279296875, + "learning_rate": 0.00014553254339612942, + "loss": 1.646, + "step": 4479 + }, + { + "epoch": 1.3891841200375117, + "grad_norm": 0.27734375, + "learning_rate": 0.00014551066623017507, + "loss": 1.4298, + "step": 4480 + }, + { + "epoch": 1.3894967177242887, + "grad_norm": 0.275390625, + "learning_rate": 0.00014548878631668382, + "loss": 1.7506, + "step": 4481 + }, + { + "epoch": 1.389809315411066, + "grad_norm": 0.283203125, + "learning_rate": 0.00014546690365697658, + "loss": 1.4685, + 
"step": 4482 + }, + { + "epoch": 1.390121913097843, + "grad_norm": 0.271484375, + "learning_rate": 0.0001454450182523744, + "loss": 1.756, + "step": 4483 + }, + { + "epoch": 1.3904345107846203, + "grad_norm": 0.283203125, + "learning_rate": 0.00014542313010419857, + "loss": 1.3334, + "step": 4484 + }, + { + "epoch": 1.3907471084713974, + "grad_norm": 0.2734375, + "learning_rate": 0.00014540123921377048, + "loss": 1.6596, + "step": 4485 + }, + { + "epoch": 1.3910597061581744, + "grad_norm": 0.271484375, + "learning_rate": 0.00014537934558241173, + "loss": 1.3967, + "step": 4486 + }, + { + "epoch": 1.3913723038449515, + "grad_norm": 0.2734375, + "learning_rate": 0.00014535744921144407, + "loss": 1.6193, + "step": 4487 + }, + { + "epoch": 1.3916849015317287, + "grad_norm": 0.287109375, + "learning_rate": 0.00014533555010218943, + "loss": 1.5161, + "step": 4488 + }, + { + "epoch": 1.3919974992185058, + "grad_norm": 0.271484375, + "learning_rate": 0.00014531364825596981, + "loss": 1.3273, + "step": 4489 + }, + { + "epoch": 1.3923100969052828, + "grad_norm": 0.271484375, + "learning_rate": 0.0001452917436741075, + "loss": 1.5277, + "step": 4490 + }, + { + "epoch": 1.3926226945920601, + "grad_norm": 0.275390625, + "learning_rate": 0.00014526983635792495, + "loss": 1.6286, + "step": 4491 + }, + { + "epoch": 1.3929352922788372, + "grad_norm": 0.263671875, + "learning_rate": 0.00014524792630874465, + "loss": 1.7798, + "step": 4492 + }, + { + "epoch": 1.3932478899656142, + "grad_norm": 0.267578125, + "learning_rate": 0.0001452260135278894, + "loss": 1.3343, + "step": 4493 + }, + { + "epoch": 1.3935604876523913, + "grad_norm": 0.26953125, + "learning_rate": 0.00014520409801668208, + "loss": 1.5668, + "step": 4494 + }, + { + "epoch": 1.3938730853391685, + "grad_norm": 0.26953125, + "learning_rate": 0.00014518217977644576, + "loss": 1.8438, + "step": 4495 + }, + { + "epoch": 1.3941856830259456, + "grad_norm": 0.27734375, + "learning_rate": 0.00014516025880850365, + "loss": 1.4986, + "step": 4496 + }, + { + "epoch": 1.3944982807127229, + "grad_norm": 0.271484375, + "learning_rate": 0.0001451383351141792, + "loss": 1.5536, + "step": 4497 + }, + { + "epoch": 1.3948108783995, + "grad_norm": 0.267578125, + "learning_rate": 0.00014511640869479593, + "loss": 1.317, + "step": 4498 + }, + { + "epoch": 1.395123476086277, + "grad_norm": 0.271484375, + "learning_rate": 0.00014509447955167757, + "loss": 1.4538, + "step": 4499 + }, + { + "epoch": 1.395436073773054, + "grad_norm": 0.29296875, + "learning_rate": 0.000145072547686148, + "loss": 1.4144, + "step": 4500 + }, + { + "epoch": 1.3957486714598313, + "grad_norm": 0.27734375, + "learning_rate": 0.00014505061309953132, + "loss": 1.673, + "step": 4501 + }, + { + "epoch": 1.3960612691466083, + "grad_norm": 0.30078125, + "learning_rate": 0.0001450286757931517, + "loss": 1.5612, + "step": 4502 + }, + { + "epoch": 1.3963738668333854, + "grad_norm": 0.2890625, + "learning_rate": 0.00014500673576833353, + "loss": 1.8646, + "step": 4503 + }, + { + "epoch": 1.3966864645201627, + "grad_norm": 0.267578125, + "learning_rate": 0.00014498479302640138, + "loss": 1.4747, + "step": 4504 + }, + { + "epoch": 1.3969990622069397, + "grad_norm": 0.271484375, + "learning_rate": 0.00014496284756867995, + "loss": 1.5751, + "step": 4505 + }, + { + "epoch": 1.3973116598937168, + "grad_norm": 0.28125, + "learning_rate": 0.0001449408993964941, + "loss": 1.631, + "step": 4506 + }, + { + "epoch": 1.3976242575804938, + "grad_norm": 0.291015625, + "learning_rate": 0.00014491894851116895, + 
"loss": 1.366, + "step": 4507 + }, + { + "epoch": 1.397936855267271, + "grad_norm": 0.263671875, + "learning_rate": 0.00014489699491402957, + "loss": 1.5104, + "step": 4508 + }, + { + "epoch": 1.3982494529540481, + "grad_norm": 0.296875, + "learning_rate": 0.00014487503860640143, + "loss": 1.9793, + "step": 4509 + }, + { + "epoch": 1.3985620506408254, + "grad_norm": 0.275390625, + "learning_rate": 0.00014485307958960999, + "loss": 1.5269, + "step": 4510 + }, + { + "epoch": 1.3988746483276024, + "grad_norm": 0.2734375, + "learning_rate": 0.000144831117864981, + "loss": 1.8076, + "step": 4511 + }, + { + "epoch": 1.3991872460143795, + "grad_norm": 0.275390625, + "learning_rate": 0.00014480915343384026, + "loss": 1.5925, + "step": 4512 + }, + { + "epoch": 1.3994998437011565, + "grad_norm": 0.2890625, + "learning_rate": 0.0001447871862975139, + "loss": 1.7906, + "step": 4513 + }, + { + "epoch": 1.3998124413879336, + "grad_norm": 0.27734375, + "learning_rate": 0.00014476521645732796, + "loss": 1.5804, + "step": 4514 + }, + { + "epoch": 1.4001250390747109, + "grad_norm": 0.279296875, + "learning_rate": 0.0001447432439146089, + "loss": 1.552, + "step": 4515 + }, + { + "epoch": 1.400437636761488, + "grad_norm": 0.27734375, + "learning_rate": 0.00014472126867068317, + "loss": 1.3919, + "step": 4516 + }, + { + "epoch": 1.4007502344482652, + "grad_norm": 0.2734375, + "learning_rate": 0.00014469929072687747, + "loss": 1.7584, + "step": 4517 + }, + { + "epoch": 1.4010628321350422, + "grad_norm": 0.279296875, + "learning_rate": 0.00014467731008451862, + "loss": 1.5451, + "step": 4518 + }, + { + "epoch": 1.4013754298218193, + "grad_norm": 0.26953125, + "learning_rate": 0.00014465532674493364, + "loss": 1.3442, + "step": 4519 + }, + { + "epoch": 1.4016880275085963, + "grad_norm": 0.271484375, + "learning_rate": 0.00014463334070944966, + "loss": 1.6606, + "step": 4520 + }, + { + "epoch": 1.4020006251953736, + "grad_norm": 0.2734375, + "learning_rate": 0.00014461135197939406, + "loss": 1.7989, + "step": 4521 + }, + { + "epoch": 1.4023132228821507, + "grad_norm": 0.279296875, + "learning_rate": 0.00014458936055609424, + "loss": 1.5901, + "step": 4522 + }, + { + "epoch": 1.402625820568928, + "grad_norm": 0.283203125, + "learning_rate": 0.00014456736644087793, + "loss": 1.5985, + "step": 4523 + }, + { + "epoch": 1.402938418255705, + "grad_norm": 0.2890625, + "learning_rate": 0.00014454536963507292, + "loss": 1.7222, + "step": 4524 + }, + { + "epoch": 1.403251015942482, + "grad_norm": 0.271484375, + "learning_rate": 0.0001445233701400072, + "loss": 1.6968, + "step": 4525 + }, + { + "epoch": 1.403563613629259, + "grad_norm": 0.279296875, + "learning_rate": 0.00014450136795700886, + "loss": 1.4162, + "step": 4526 + }, + { + "epoch": 1.4038762113160361, + "grad_norm": 0.267578125, + "learning_rate": 0.0001444793630874062, + "loss": 1.4792, + "step": 4527 + }, + { + "epoch": 1.4041888090028134, + "grad_norm": 0.271484375, + "learning_rate": 0.00014445735553252775, + "loss": 1.5032, + "step": 4528 + }, + { + "epoch": 1.4045014066895904, + "grad_norm": 0.27734375, + "learning_rate": 0.0001444353452937021, + "loss": 1.4928, + "step": 4529 + }, + { + "epoch": 1.4048140043763677, + "grad_norm": 0.265625, + "learning_rate": 0.00014441333237225803, + "loss": 1.5954, + "step": 4530 + }, + { + "epoch": 1.4051266020631448, + "grad_norm": 0.275390625, + "learning_rate": 0.00014439131676952446, + "loss": 1.4347, + "step": 4531 + }, + { + "epoch": 1.4054391997499218, + "grad_norm": 0.2734375, + "learning_rate": 
0.00014436929848683053, + "loss": 1.4861, + "step": 4532 + }, + { + "epoch": 1.4057517974366989, + "grad_norm": 0.28125, + "learning_rate": 0.00014434727752550555, + "loss": 1.8632, + "step": 4533 + }, + { + "epoch": 1.4060643951234761, + "grad_norm": 0.287109375, + "learning_rate": 0.00014432525388687886, + "loss": 1.7035, + "step": 4534 + }, + { + "epoch": 1.4063769928102532, + "grad_norm": 0.283203125, + "learning_rate": 0.00014430322757228014, + "loss": 1.4831, + "step": 4535 + }, + { + "epoch": 1.4066895904970302, + "grad_norm": 0.279296875, + "learning_rate": 0.00014428119858303913, + "loss": 1.7658, + "step": 4536 + }, + { + "epoch": 1.4070021881838075, + "grad_norm": 0.275390625, + "learning_rate": 0.00014425916692048572, + "loss": 1.6409, + "step": 4537 + }, + { + "epoch": 1.4073147858705846, + "grad_norm": 0.271484375, + "learning_rate": 0.00014423713258594997, + "loss": 1.4858, + "step": 4538 + }, + { + "epoch": 1.4076273835573616, + "grad_norm": 0.2734375, + "learning_rate": 0.0001442150955807622, + "loss": 1.3965, + "step": 4539 + }, + { + "epoch": 1.4079399812441387, + "grad_norm": 0.26953125, + "learning_rate": 0.00014419305590625272, + "loss": 1.4889, + "step": 4540 + }, + { + "epoch": 1.408252578930916, + "grad_norm": 0.265625, + "learning_rate": 0.00014417101356375218, + "loss": 1.4343, + "step": 4541 + }, + { + "epoch": 1.408565176617693, + "grad_norm": 0.275390625, + "learning_rate": 0.00014414896855459124, + "loss": 1.3446, + "step": 4542 + }, + { + "epoch": 1.4088777743044703, + "grad_norm": 0.271484375, + "learning_rate": 0.00014412692088010083, + "loss": 1.5692, + "step": 4543 + }, + { + "epoch": 1.4091903719912473, + "grad_norm": 0.267578125, + "learning_rate": 0.00014410487054161196, + "loss": 1.4879, + "step": 4544 + }, + { + "epoch": 1.4095029696780244, + "grad_norm": 0.28125, + "learning_rate": 0.00014408281754045585, + "loss": 1.5013, + "step": 4545 + }, + { + "epoch": 1.4098155673648014, + "grad_norm": 0.265625, + "learning_rate": 0.00014406076187796387, + "loss": 1.4588, + "step": 4546 + }, + { + "epoch": 1.4101281650515787, + "grad_norm": 0.275390625, + "learning_rate": 0.00014403870355546755, + "loss": 1.6314, + "step": 4547 + }, + { + "epoch": 1.4104407627383557, + "grad_norm": 0.26953125, + "learning_rate": 0.00014401664257429858, + "loss": 1.5477, + "step": 4548 + }, + { + "epoch": 1.4107533604251328, + "grad_norm": 0.267578125, + "learning_rate": 0.00014399457893578884, + "loss": 1.5232, + "step": 4549 + }, + { + "epoch": 1.41106595811191, + "grad_norm": 0.27734375, + "learning_rate": 0.00014397251264127022, + "loss": 1.7653, + "step": 4550 + }, + { + "epoch": 1.411378555798687, + "grad_norm": 0.275390625, + "learning_rate": 0.00014395044369207505, + "loss": 1.8346, + "step": 4551 + }, + { + "epoch": 1.4116911534854641, + "grad_norm": 0.26953125, + "learning_rate": 0.00014392837208953557, + "loss": 1.6111, + "step": 4552 + }, + { + "epoch": 1.4120037511722412, + "grad_norm": 0.298828125, + "learning_rate": 0.00014390629783498428, + "loss": 1.6006, + "step": 4553 + }, + { + "epoch": 1.4123163488590185, + "grad_norm": 0.263671875, + "learning_rate": 0.00014388422092975387, + "loss": 1.3409, + "step": 4554 + }, + { + "epoch": 1.4126289465457955, + "grad_norm": 0.2734375, + "learning_rate": 0.00014386214137517708, + "loss": 1.4263, + "step": 4555 + }, + { + "epoch": 1.4129415442325728, + "grad_norm": 0.27734375, + "learning_rate": 0.00014384005917258695, + "loss": 1.4879, + "step": 4556 + }, + { + "epoch": 1.4132541419193498, + "grad_norm": 0.275390625, + 
"learning_rate": 0.00014381797432331658, + "loss": 1.6945, + "step": 4557 + }, + { + "epoch": 1.4135667396061269, + "grad_norm": 0.255859375, + "learning_rate": 0.0001437958868286992, + "loss": 1.3465, + "step": 4558 + }, + { + "epoch": 1.413879337292904, + "grad_norm": 0.263671875, + "learning_rate": 0.0001437737966900684, + "loss": 1.3708, + "step": 4559 + }, + { + "epoch": 1.4141919349796812, + "grad_norm": 0.287109375, + "learning_rate": 0.00014375170390875768, + "loss": 1.5018, + "step": 4560 + }, + { + "epoch": 1.4145045326664583, + "grad_norm": 0.2890625, + "learning_rate": 0.00014372960848610085, + "loss": 1.428, + "step": 4561 + }, + { + "epoch": 1.4148171303532353, + "grad_norm": 0.26953125, + "learning_rate": 0.00014370751042343182, + "loss": 1.3923, + "step": 4562 + }, + { + "epoch": 1.4151297280400126, + "grad_norm": 0.251953125, + "learning_rate": 0.00014368540972208471, + "loss": 1.3904, + "step": 4563 + }, + { + "epoch": 1.4154423257267896, + "grad_norm": 0.2734375, + "learning_rate": 0.0001436633063833937, + "loss": 1.7971, + "step": 4564 + }, + { + "epoch": 1.4157549234135667, + "grad_norm": 0.271484375, + "learning_rate": 0.0001436412004086933, + "loss": 1.3722, + "step": 4565 + }, + { + "epoch": 1.4160675211003437, + "grad_norm": 0.275390625, + "learning_rate": 0.000143619091799318, + "loss": 1.7189, + "step": 4566 + }, + { + "epoch": 1.416380118787121, + "grad_norm": 0.27734375, + "learning_rate": 0.00014359698055660256, + "loss": 1.4373, + "step": 4567 + }, + { + "epoch": 1.416692716473898, + "grad_norm": 0.2734375, + "learning_rate": 0.0001435748666818818, + "loss": 1.5061, + "step": 4568 + }, + { + "epoch": 1.4170053141606753, + "grad_norm": 0.27734375, + "learning_rate": 0.00014355275017649083, + "loss": 1.5032, + "step": 4569 + }, + { + "epoch": 1.4173179118474524, + "grad_norm": 0.283203125, + "learning_rate": 0.00014353063104176483, + "loss": 1.5084, + "step": 4570 + }, + { + "epoch": 1.4176305095342294, + "grad_norm": 0.29296875, + "learning_rate": 0.0001435085092790392, + "loss": 1.4757, + "step": 4571 + }, + { + "epoch": 1.4179431072210065, + "grad_norm": 0.291015625, + "learning_rate": 0.00014348638488964938, + "loss": 1.5824, + "step": 4572 + }, + { + "epoch": 1.4182557049077837, + "grad_norm": 0.2734375, + "learning_rate": 0.00014346425787493112, + "loss": 1.7873, + "step": 4573 + }, + { + "epoch": 1.4185683025945608, + "grad_norm": 0.28515625, + "learning_rate": 0.00014344212823622017, + "loss": 1.6968, + "step": 4574 + }, + { + "epoch": 1.4188809002813378, + "grad_norm": 0.2578125, + "learning_rate": 0.00014341999597485266, + "loss": 1.776, + "step": 4575 + }, + { + "epoch": 1.4191934979681151, + "grad_norm": 0.2890625, + "learning_rate": 0.00014339786109216458, + "loss": 1.7142, + "step": 4576 + }, + { + "epoch": 1.4195060956548922, + "grad_norm": 0.255859375, + "learning_rate": 0.00014337572358949242, + "loss": 1.4147, + "step": 4577 + }, + { + "epoch": 1.4198186933416692, + "grad_norm": 0.271484375, + "learning_rate": 0.00014335358346817244, + "loss": 1.6323, + "step": 4578 + }, + { + "epoch": 1.4201312910284463, + "grad_norm": 0.275390625, + "learning_rate": 0.00014333144072954144, + "loss": 1.6688, + "step": 4579 + }, + { + "epoch": 1.4204438887152235, + "grad_norm": 0.28125, + "learning_rate": 0.00014330929537493615, + "loss": 1.6052, + "step": 4580 + }, + { + "epoch": 1.4207564864020006, + "grad_norm": 0.275390625, + "learning_rate": 0.0001432871474056935, + "loss": 1.4074, + "step": 4581 + }, + { + "epoch": 1.4210690840887779, + "grad_norm": 
0.26171875, + "learning_rate": 0.00014326499682315057, + "loss": 1.4452, + "step": 4582 + }, + { + "epoch": 1.421381681775555, + "grad_norm": 0.275390625, + "learning_rate": 0.00014324284362864472, + "loss": 1.6482, + "step": 4583 + }, + { + "epoch": 1.421694279462332, + "grad_norm": 0.279296875, + "learning_rate": 0.00014322068782351323, + "loss": 1.5454, + "step": 4584 + }, + { + "epoch": 1.422006877149109, + "grad_norm": 0.291015625, + "learning_rate": 0.00014319852940909377, + "loss": 1.4762, + "step": 4585 + }, + { + "epoch": 1.4223194748358863, + "grad_norm": 0.287109375, + "learning_rate": 0.00014317636838672402, + "loss": 1.4904, + "step": 4586 + }, + { + "epoch": 1.4226320725226633, + "grad_norm": 0.28515625, + "learning_rate": 0.0001431542047577419, + "loss": 1.4558, + "step": 4587 + }, + { + "epoch": 1.4229446702094404, + "grad_norm": 0.28125, + "learning_rate": 0.00014313203852348545, + "loss": 1.5608, + "step": 4588 + }, + { + "epoch": 1.4232572678962176, + "grad_norm": 0.38671875, + "learning_rate": 0.0001431098696852929, + "loss": 2.2835, + "step": 4589 + }, + { + "epoch": 1.4235698655829947, + "grad_norm": 0.2734375, + "learning_rate": 0.00014308769824450252, + "loss": 1.5749, + "step": 4590 + }, + { + "epoch": 1.4238824632697717, + "grad_norm": 0.279296875, + "learning_rate": 0.00014306552420245293, + "loss": 1.7626, + "step": 4591 + }, + { + "epoch": 1.4241950609565488, + "grad_norm": 0.2734375, + "learning_rate": 0.00014304334756048273, + "loss": 1.4544, + "step": 4592 + }, + { + "epoch": 1.424507658643326, + "grad_norm": 0.27734375, + "learning_rate": 0.00014302116831993084, + "loss": 1.4499, + "step": 4593 + }, + { + "epoch": 1.4248202563301031, + "grad_norm": 0.287109375, + "learning_rate": 0.00014299898648213616, + "loss": 1.5541, + "step": 4594 + }, + { + "epoch": 1.4251328540168804, + "grad_norm": 0.26953125, + "learning_rate": 0.0001429768020484379, + "loss": 1.3729, + "step": 4595 + }, + { + "epoch": 1.4254454517036574, + "grad_norm": 0.271484375, + "learning_rate": 0.00014295461502017532, + "loss": 1.5893, + "step": 4596 + }, + { + "epoch": 1.4257580493904345, + "grad_norm": 0.271484375, + "learning_rate": 0.0001429324253986879, + "loss": 1.4734, + "step": 4597 + }, + { + "epoch": 1.4260706470772115, + "grad_norm": 0.28125, + "learning_rate": 0.0001429102331853152, + "loss": 1.7623, + "step": 4598 + }, + { + "epoch": 1.4263832447639888, + "grad_norm": 0.28125, + "learning_rate": 0.0001428880383813971, + "loss": 1.4604, + "step": 4599 + }, + { + "epoch": 1.4266958424507659, + "grad_norm": 0.291015625, + "learning_rate": 0.00014286584098827345, + "loss": 1.4894, + "step": 4600 + }, + { + "epoch": 1.427008440137543, + "grad_norm": 0.267578125, + "learning_rate": 0.0001428436410072844, + "loss": 1.743, + "step": 4601 + }, + { + "epoch": 1.4273210378243202, + "grad_norm": 0.267578125, + "learning_rate": 0.00014282143843977004, + "loss": 1.3815, + "step": 4602 + }, + { + "epoch": 1.4276336355110972, + "grad_norm": 0.265625, + "learning_rate": 0.00014279923328707096, + "loss": 1.4671, + "step": 4603 + }, + { + "epoch": 1.4279462331978743, + "grad_norm": 0.291015625, + "learning_rate": 0.0001427770255505276, + "loss": 1.9332, + "step": 4604 + }, + { + "epoch": 1.4282588308846513, + "grad_norm": 0.283203125, + "learning_rate": 0.0001427548152314807, + "loss": 1.6423, + "step": 4605 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.2890625, + "learning_rate": 0.0001427326023312711, + "loss": 1.7537, + "step": 4606 + }, + { + "epoch": 1.4288840262582057, + 
"grad_norm": 0.283203125, + "learning_rate": 0.0001427103868512399, + "loss": 1.4938, + "step": 4607 + }, + { + "epoch": 1.429196623944983, + "grad_norm": 0.283203125, + "learning_rate": 0.00014268816879272817, + "loss": 1.434, + "step": 4608 + }, + { + "epoch": 1.42950922163176, + "grad_norm": 0.279296875, + "learning_rate": 0.0001426659481570773, + "loss": 1.5592, + "step": 4609 + }, + { + "epoch": 1.429821819318537, + "grad_norm": 0.28125, + "learning_rate": 0.00014264372494562878, + "loss": 1.7722, + "step": 4610 + }, + { + "epoch": 1.430134417005314, + "grad_norm": 0.29296875, + "learning_rate": 0.00014262149915972422, + "loss": 1.5044, + "step": 4611 + }, + { + "epoch": 1.4304470146920913, + "grad_norm": 0.2578125, + "learning_rate": 0.00014259927080070546, + "loss": 1.804, + "step": 4612 + }, + { + "epoch": 1.4307596123788684, + "grad_norm": 0.271484375, + "learning_rate": 0.00014257703986991446, + "loss": 1.2922, + "step": 4613 + }, + { + "epoch": 1.4310722100656454, + "grad_norm": 0.283203125, + "learning_rate": 0.00014255480636869328, + "loss": 1.646, + "step": 4614 + }, + { + "epoch": 1.4313848077524227, + "grad_norm": 0.279296875, + "learning_rate": 0.0001425325702983842, + "loss": 1.4114, + "step": 4615 + }, + { + "epoch": 1.4316974054391998, + "grad_norm": 0.291015625, + "learning_rate": 0.00014251033166032964, + "loss": 1.7375, + "step": 4616 + }, + { + "epoch": 1.4320100031259768, + "grad_norm": 0.2578125, + "learning_rate": 0.00014248809045587223, + "loss": 1.4065, + "step": 4617 + }, + { + "epoch": 1.4323226008127539, + "grad_norm": 0.275390625, + "learning_rate": 0.00014246584668635464, + "loss": 1.4145, + "step": 4618 + }, + { + "epoch": 1.4326351984995311, + "grad_norm": 0.2734375, + "learning_rate": 0.00014244360035311977, + "loss": 1.576, + "step": 4619 + }, + { + "epoch": 1.4329477961863082, + "grad_norm": 0.28125, + "learning_rate": 0.0001424213514575107, + "loss": 1.581, + "step": 4620 + }, + { + "epoch": 1.4332603938730855, + "grad_norm": 0.28125, + "learning_rate": 0.00014239910000087052, + "loss": 1.7384, + "step": 4621 + }, + { + "epoch": 1.4335729915598625, + "grad_norm": 0.27734375, + "learning_rate": 0.00014237684598454267, + "loss": 1.353, + "step": 4622 + }, + { + "epoch": 1.4338855892466396, + "grad_norm": 0.27734375, + "learning_rate": 0.0001423545894098706, + "loss": 1.1421, + "step": 4623 + }, + { + "epoch": 1.4341981869334166, + "grad_norm": 0.26953125, + "learning_rate": 0.00014233233027819803, + "loss": 1.594, + "step": 4624 + }, + { + "epoch": 1.4345107846201939, + "grad_norm": 0.26953125, + "learning_rate": 0.00014231006859086875, + "loss": 1.3478, + "step": 4625 + }, + { + "epoch": 1.434823382306971, + "grad_norm": 0.27734375, + "learning_rate": 0.00014228780434922666, + "loss": 1.642, + "step": 4626 + }, + { + "epoch": 1.435135979993748, + "grad_norm": 0.28125, + "learning_rate": 0.00014226553755461598, + "loss": 1.6561, + "step": 4627 + }, + { + "epoch": 1.4354485776805253, + "grad_norm": 0.28515625, + "learning_rate": 0.0001422432682083809, + "loss": 1.4408, + "step": 4628 + }, + { + "epoch": 1.4357611753673023, + "grad_norm": 0.26953125, + "learning_rate": 0.0001422209963118659, + "loss": 1.8463, + "step": 4629 + }, + { + "epoch": 1.4360737730540793, + "grad_norm": 0.2734375, + "learning_rate": 0.00014219872186641558, + "loss": 1.6674, + "step": 4630 + }, + { + "epoch": 1.4363863707408564, + "grad_norm": 0.275390625, + "learning_rate": 0.00014217644487337462, + "loss": 1.5182, + "step": 4631 + }, + { + "epoch": 1.4366989684276337, + 
"grad_norm": 0.267578125, + "learning_rate": 0.00014215416533408794, + "loss": 1.541, + "step": 4632 + }, + { + "epoch": 1.4370115661144107, + "grad_norm": 0.27734375, + "learning_rate": 0.00014213188324990058, + "loss": 1.5117, + "step": 4633 + }, + { + "epoch": 1.437324163801188, + "grad_norm": 0.2734375, + "learning_rate": 0.00014210959862215775, + "loss": 1.9105, + "step": 4634 + }, + { + "epoch": 1.437636761487965, + "grad_norm": 0.267578125, + "learning_rate": 0.00014208731145220476, + "loss": 1.8691, + "step": 4635 + }, + { + "epoch": 1.437949359174742, + "grad_norm": 0.2734375, + "learning_rate": 0.00014206502174138718, + "loss": 1.4839, + "step": 4636 + }, + { + "epoch": 1.4382619568615191, + "grad_norm": 0.2734375, + "learning_rate": 0.0001420427294910506, + "loss": 1.6767, + "step": 4637 + }, + { + "epoch": 1.4385745545482964, + "grad_norm": 0.283203125, + "learning_rate": 0.0001420204347025409, + "loss": 1.6293, + "step": 4638 + }, + { + "epoch": 1.4388871522350735, + "grad_norm": 0.28515625, + "learning_rate": 0.000141998137377204, + "loss": 1.6073, + "step": 4639 + }, + { + "epoch": 1.4391997499218505, + "grad_norm": 0.263671875, + "learning_rate": 0.00014197583751638602, + "loss": 1.7099, + "step": 4640 + }, + { + "epoch": 1.4395123476086278, + "grad_norm": 0.27734375, + "learning_rate": 0.00014195353512143322, + "loss": 1.5057, + "step": 4641 + }, + { + "epoch": 1.4398249452954048, + "grad_norm": 0.291015625, + "learning_rate": 0.0001419312301936921, + "loss": 1.6119, + "step": 4642 + }, + { + "epoch": 1.4401375429821819, + "grad_norm": 0.279296875, + "learning_rate": 0.00014190892273450913, + "loss": 1.4598, + "step": 4643 + }, + { + "epoch": 1.440450140668959, + "grad_norm": 0.275390625, + "learning_rate": 0.0001418866127452311, + "loss": 1.3623, + "step": 4644 + }, + { + "epoch": 1.4407627383557362, + "grad_norm": 0.298828125, + "learning_rate": 0.0001418643002272049, + "loss": 1.5883, + "step": 4645 + }, + { + "epoch": 1.4410753360425133, + "grad_norm": 0.279296875, + "learning_rate": 0.00014184198518177752, + "loss": 1.7833, + "step": 4646 + }, + { + "epoch": 1.4413879337292905, + "grad_norm": 0.296875, + "learning_rate": 0.00014181966761029618, + "loss": 1.8222, + "step": 4647 + }, + { + "epoch": 1.4417005314160676, + "grad_norm": 0.263671875, + "learning_rate": 0.0001417973475141082, + "loss": 1.6506, + "step": 4648 + }, + { + "epoch": 1.4420131291028446, + "grad_norm": 0.287109375, + "learning_rate": 0.00014177502489456112, + "loss": 1.4809, + "step": 4649 + }, + { + "epoch": 1.4423257267896217, + "grad_norm": 0.267578125, + "learning_rate": 0.00014175269975300257, + "loss": 1.6961, + "step": 4650 + }, + { + "epoch": 1.442638324476399, + "grad_norm": 0.294921875, + "learning_rate": 0.0001417303720907803, + "loss": 1.7772, + "step": 4651 + }, + { + "epoch": 1.442950922163176, + "grad_norm": 0.26953125, + "learning_rate": 0.00014170804190924226, + "loss": 1.4788, + "step": 4652 + }, + { + "epoch": 1.443263519849953, + "grad_norm": 0.26953125, + "learning_rate": 0.00014168570920973663, + "loss": 1.6487, + "step": 4653 + }, + { + "epoch": 1.4435761175367303, + "grad_norm": 0.265625, + "learning_rate": 0.0001416633739936116, + "loss": 1.3923, + "step": 4654 + }, + { + "epoch": 1.4438887152235074, + "grad_norm": 0.275390625, + "learning_rate": 0.00014164103626221558, + "loss": 1.5774, + "step": 4655 + }, + { + "epoch": 1.4442013129102844, + "grad_norm": 0.265625, + "learning_rate": 0.00014161869601689715, + "loss": 1.5656, + "step": 4656 + }, + { + "epoch": 
1.4445139105970615, + "grad_norm": 0.279296875, + "learning_rate": 0.000141596353259005, + "loss": 1.3998, + "step": 4657 + }, + { + "epoch": 1.4448265082838387, + "grad_norm": 0.271484375, + "learning_rate": 0.000141574007989888, + "loss": 1.4597, + "step": 4658 + }, + { + "epoch": 1.4451391059706158, + "grad_norm": 0.279296875, + "learning_rate": 0.00014155166021089514, + "loss": 1.5555, + "step": 4659 + }, + { + "epoch": 1.445451703657393, + "grad_norm": 0.25390625, + "learning_rate": 0.00014152930992337564, + "loss": 1.5083, + "step": 4660 + }, + { + "epoch": 1.44576430134417, + "grad_norm": 0.26171875, + "learning_rate": 0.00014150695712867873, + "loss": 1.526, + "step": 4661 + }, + { + "epoch": 1.4460768990309472, + "grad_norm": 0.283203125, + "learning_rate": 0.00014148460182815398, + "loss": 1.6716, + "step": 4662 + }, + { + "epoch": 1.4463894967177242, + "grad_norm": 0.279296875, + "learning_rate": 0.0001414622440231509, + "loss": 1.4131, + "step": 4663 + }, + { + "epoch": 1.4467020944045015, + "grad_norm": 0.265625, + "learning_rate": 0.00014143988371501936, + "loss": 1.52, + "step": 4664 + }, + { + "epoch": 1.4470146920912785, + "grad_norm": 0.271484375, + "learning_rate": 0.0001414175209051092, + "loss": 1.5605, + "step": 4665 + }, + { + "epoch": 1.4473272897780556, + "grad_norm": 0.265625, + "learning_rate": 0.00014139515559477057, + "loss": 1.3161, + "step": 4666 + }, + { + "epoch": 1.4476398874648329, + "grad_norm": 0.271484375, + "learning_rate": 0.0001413727877853536, + "loss": 1.3972, + "step": 4667 + }, + { + "epoch": 1.44795248515161, + "grad_norm": 0.25390625, + "learning_rate": 0.0001413504174782087, + "loss": 1.4792, + "step": 4668 + }, + { + "epoch": 1.448265082838387, + "grad_norm": 0.26953125, + "learning_rate": 0.00014132804467468642, + "loss": 1.474, + "step": 4669 + }, + { + "epoch": 1.448577680525164, + "grad_norm": 0.2890625, + "learning_rate": 0.00014130566937613743, + "loss": 1.4871, + "step": 4670 + }, + { + "epoch": 1.4488902782119413, + "grad_norm": 0.2890625, + "learning_rate": 0.00014128329158391253, + "loss": 1.5615, + "step": 4671 + }, + { + "epoch": 1.4492028758987183, + "grad_norm": 0.2734375, + "learning_rate": 0.00014126091129936275, + "loss": 1.6239, + "step": 4672 + }, + { + "epoch": 1.4495154735854956, + "grad_norm": 0.275390625, + "learning_rate": 0.0001412385285238391, + "loss": 1.4107, + "step": 4673 + }, + { + "epoch": 1.4498280712722726, + "grad_norm": 0.283203125, + "learning_rate": 0.00014121614325869302, + "loss": 1.4174, + "step": 4674 + }, + { + "epoch": 1.4501406689590497, + "grad_norm": 0.2890625, + "learning_rate": 0.00014119375550527581, + "loss": 1.6103, + "step": 4675 + }, + { + "epoch": 1.4504532666458267, + "grad_norm": 0.26171875, + "learning_rate": 0.00014117136526493903, + "loss": 1.4591, + "step": 4676 + }, + { + "epoch": 1.450765864332604, + "grad_norm": 0.27734375, + "learning_rate": 0.00014114897253903453, + "loss": 1.7177, + "step": 4677 + }, + { + "epoch": 1.451078462019381, + "grad_norm": 0.271484375, + "learning_rate": 0.00014112657732891413, + "loss": 1.6403, + "step": 4678 + }, + { + "epoch": 1.4513910597061581, + "grad_norm": 0.27734375, + "learning_rate": 0.00014110417963592983, + "loss": 1.3365, + "step": 4679 + }, + { + "epoch": 1.4517036573929354, + "grad_norm": 0.279296875, + "learning_rate": 0.0001410817794614338, + "loss": 1.7381, + "step": 4680 + }, + { + "epoch": 1.4520162550797124, + "grad_norm": 0.2734375, + "learning_rate": 0.0001410593768067784, + "loss": 1.7538, + "step": 4681 + }, + { + "epoch": 
1.4523288527664895, + "grad_norm": 0.26171875, + "learning_rate": 0.00014103697167331612, + "loss": 1.5666, + "step": 4682 + }, + { + "epoch": 1.4526414504532665, + "grad_norm": 0.26171875, + "learning_rate": 0.00014101456406239956, + "loss": 1.3204, + "step": 4683 + }, + { + "epoch": 1.4529540481400438, + "grad_norm": 0.28125, + "learning_rate": 0.0001409921539753815, + "loss": 1.5704, + "step": 4684 + }, + { + "epoch": 1.4532666458268209, + "grad_norm": 0.357421875, + "learning_rate": 0.00014096974141361484, + "loss": 2.0966, + "step": 4685 + }, + { + "epoch": 1.4535792435135981, + "grad_norm": 0.271484375, + "learning_rate": 0.00014094732637845274, + "loss": 1.3494, + "step": 4686 + }, + { + "epoch": 1.4538918412003752, + "grad_norm": 0.2890625, + "learning_rate": 0.00014092490887124832, + "loss": 1.5677, + "step": 4687 + }, + { + "epoch": 1.4542044388871522, + "grad_norm": 0.279296875, + "learning_rate": 0.000140902488893355, + "loss": 1.6437, + "step": 4688 + }, + { + "epoch": 1.4545170365739293, + "grad_norm": 0.267578125, + "learning_rate": 0.00014088006644612634, + "loss": 1.4874, + "step": 4689 + }, + { + "epoch": 1.4548296342607066, + "grad_norm": 0.287109375, + "learning_rate": 0.00014085764153091597, + "loss": 1.4281, + "step": 4690 + }, + { + "epoch": 1.4551422319474836, + "grad_norm": 0.28515625, + "learning_rate": 0.00014083521414907766, + "loss": 1.7058, + "step": 4691 + }, + { + "epoch": 1.4554548296342606, + "grad_norm": 0.27734375, + "learning_rate": 0.00014081278430196553, + "loss": 1.5046, + "step": 4692 + }, + { + "epoch": 1.455767427321038, + "grad_norm": 0.271484375, + "learning_rate": 0.0001407903519909335, + "loss": 1.5766, + "step": 4693 + }, + { + "epoch": 1.456080025007815, + "grad_norm": 0.28515625, + "learning_rate": 0.000140767917217336, + "loss": 1.5645, + "step": 4694 + }, + { + "epoch": 1.456392622694592, + "grad_norm": 0.265625, + "learning_rate": 0.00014074547998252742, + "loss": 1.4359, + "step": 4695 + }, + { + "epoch": 1.456705220381369, + "grad_norm": 0.28515625, + "learning_rate": 0.00014072304028786223, + "loss": 1.5674, + "step": 4696 + }, + { + "epoch": 1.4570178180681463, + "grad_norm": 0.267578125, + "learning_rate": 0.0001407005981346952, + "loss": 1.4606, + "step": 4697 + }, + { + "epoch": 1.4573304157549234, + "grad_norm": 0.28125, + "learning_rate": 0.00014067815352438123, + "loss": 1.4547, + "step": 4698 + }, + { + "epoch": 1.4576430134417007, + "grad_norm": 0.287109375, + "learning_rate": 0.00014065570645827526, + "loss": 1.688, + "step": 4699 + }, + { + "epoch": 1.4579556111284777, + "grad_norm": 0.28125, + "learning_rate": 0.00014063325693773252, + "loss": 1.6072, + "step": 4700 + }, + { + "epoch": 1.4582682088152548, + "grad_norm": 0.29296875, + "learning_rate": 0.00014061080496410825, + "loss": 1.5536, + "step": 4701 + }, + { + "epoch": 1.4585808065020318, + "grad_norm": 0.2734375, + "learning_rate": 0.00014058835053875796, + "loss": 1.676, + "step": 4702 + }, + { + "epoch": 1.458893404188809, + "grad_norm": 0.28515625, + "learning_rate": 0.0001405658936630372, + "loss": 1.7725, + "step": 4703 + }, + { + "epoch": 1.4592060018755861, + "grad_norm": 0.291015625, + "learning_rate": 0.00014054343433830176, + "loss": 1.7091, + "step": 4704 + }, + { + "epoch": 1.4595185995623632, + "grad_norm": 0.349609375, + "learning_rate": 0.00014052097256590752, + "loss": 2.3213, + "step": 4705 + }, + { + "epoch": 1.4598311972491405, + "grad_norm": 0.275390625, + "learning_rate": 0.00014049850834721054, + "loss": 1.3884, + "step": 4706 + }, + { + 
"epoch": 1.4601437949359175, + "grad_norm": 0.28515625, + "learning_rate": 0.000140476041683567, + "loss": 1.5953, + "step": 4707 + }, + { + "epoch": 1.4604563926226946, + "grad_norm": 0.26171875, + "learning_rate": 0.00014045357257633323, + "loss": 1.5026, + "step": 4708 + }, + { + "epoch": 1.4607689903094716, + "grad_norm": 0.28125, + "learning_rate": 0.00014043110102686574, + "loss": 1.512, + "step": 4709 + }, + { + "epoch": 1.4610815879962489, + "grad_norm": 0.287109375, + "learning_rate": 0.0001404086270365212, + "loss": 1.5442, + "step": 4710 + }, + { + "epoch": 1.461394185683026, + "grad_norm": 0.2890625, + "learning_rate": 0.00014038615060665626, + "loss": 1.5655, + "step": 4711 + }, + { + "epoch": 1.4617067833698032, + "grad_norm": 0.275390625, + "learning_rate": 0.00014036367173862803, + "loss": 1.6321, + "step": 4712 + }, + { + "epoch": 1.4620193810565802, + "grad_norm": 0.2734375, + "learning_rate": 0.00014034119043379348, + "loss": 1.3139, + "step": 4713 + }, + { + "epoch": 1.4623319787433573, + "grad_norm": 0.283203125, + "learning_rate": 0.00014031870669350984, + "loss": 1.6902, + "step": 4714 + }, + { + "epoch": 1.4626445764301343, + "grad_norm": 0.271484375, + "learning_rate": 0.0001402962205191345, + "loss": 1.5025, + "step": 4715 + }, + { + "epoch": 1.4629571741169114, + "grad_norm": 0.2734375, + "learning_rate": 0.000140273731912025, + "loss": 1.6473, + "step": 4716 + }, + { + "epoch": 1.4632697718036887, + "grad_norm": 0.279296875, + "learning_rate": 0.00014025124087353892, + "loss": 1.4133, + "step": 4717 + }, + { + "epoch": 1.4635823694904657, + "grad_norm": 0.265625, + "learning_rate": 0.0001402287474050342, + "loss": 1.3423, + "step": 4718 + }, + { + "epoch": 1.463894967177243, + "grad_norm": 0.2890625, + "learning_rate": 0.00014020625150786874, + "loss": 1.6132, + "step": 4719 + }, + { + "epoch": 1.46420756486402, + "grad_norm": 0.271484375, + "learning_rate": 0.0001401837531834006, + "loss": 1.4984, + "step": 4720 + }, + { + "epoch": 1.464520162550797, + "grad_norm": 0.275390625, + "learning_rate": 0.0001401612524329881, + "loss": 1.5169, + "step": 4721 + }, + { + "epoch": 1.4648327602375741, + "grad_norm": 0.275390625, + "learning_rate": 0.0001401387492579896, + "loss": 1.5537, + "step": 4722 + }, + { + "epoch": 1.4651453579243514, + "grad_norm": 0.291015625, + "learning_rate": 0.00014011624365976364, + "loss": 1.8074, + "step": 4723 + }, + { + "epoch": 1.4654579556111285, + "grad_norm": 0.26953125, + "learning_rate": 0.000140093735639669, + "loss": 1.6142, + "step": 4724 + }, + { + "epoch": 1.4657705532979057, + "grad_norm": 0.296875, + "learning_rate": 0.00014007122519906436, + "loss": 1.7389, + "step": 4725 + }, + { + "epoch": 1.4660831509846828, + "grad_norm": 0.271484375, + "learning_rate": 0.00014004871233930883, + "loss": 1.8496, + "step": 4726 + }, + { + "epoch": 1.4663957486714598, + "grad_norm": 0.291015625, + "learning_rate": 0.0001400261970617615, + "loss": 1.3808, + "step": 4727 + }, + { + "epoch": 1.4667083463582369, + "grad_norm": 0.296875, + "learning_rate": 0.00014000367936778166, + "loss": 1.4361, + "step": 4728 + }, + { + "epoch": 1.467020944045014, + "grad_norm": 0.275390625, + "learning_rate": 0.00013998115925872867, + "loss": 1.2945, + "step": 4729 + }, + { + "epoch": 1.4673335417317912, + "grad_norm": 0.28515625, + "learning_rate": 0.00013995863673596225, + "loss": 1.8561, + "step": 4730 + }, + { + "epoch": 1.4676461394185683, + "grad_norm": 0.279296875, + "learning_rate": 0.0001399361118008419, + "loss": 1.3074, + "step": 4731 + }, + { + 
"epoch": 1.4679587371053455, + "grad_norm": 0.279296875, + "learning_rate": 0.00013991358445472764, + "loss": 1.5621, + "step": 4732 + }, + { + "epoch": 1.4682713347921226, + "grad_norm": 0.27734375, + "learning_rate": 0.0001398910546989794, + "loss": 1.3776, + "step": 4733 + }, + { + "epoch": 1.4685839324788996, + "grad_norm": 0.265625, + "learning_rate": 0.00013986852253495738, + "loss": 1.4626, + "step": 4734 + }, + { + "epoch": 1.4688965301656767, + "grad_norm": 0.28515625, + "learning_rate": 0.00013984598796402183, + "loss": 1.7205, + "step": 4735 + }, + { + "epoch": 1.469209127852454, + "grad_norm": 0.29296875, + "learning_rate": 0.00013982345098753325, + "loss": 1.3697, + "step": 4736 + }, + { + "epoch": 1.469521725539231, + "grad_norm": 0.396484375, + "learning_rate": 0.00013980091160685216, + "loss": 2.0519, + "step": 4737 + }, + { + "epoch": 1.4698343232260083, + "grad_norm": 0.287109375, + "learning_rate": 0.0001397783698233393, + "loss": 1.5893, + "step": 4738 + }, + { + "epoch": 1.4701469209127853, + "grad_norm": 0.2890625, + "learning_rate": 0.0001397558256383556, + "loss": 1.7312, + "step": 4739 + }, + { + "epoch": 1.4704595185995624, + "grad_norm": 0.28125, + "learning_rate": 0.00013973327905326204, + "loss": 1.4599, + "step": 4740 + }, + { + "epoch": 1.4707721162863394, + "grad_norm": 0.283203125, + "learning_rate": 0.00013971073006941974, + "loss": 1.5603, + "step": 4741 + }, + { + "epoch": 1.4710847139731165, + "grad_norm": 0.271484375, + "learning_rate": 0.00013968817868819012, + "loss": 1.5871, + "step": 4742 + }, + { + "epoch": 1.4713973116598937, + "grad_norm": 0.279296875, + "learning_rate": 0.00013966562491093455, + "loss": 1.7609, + "step": 4743 + }, + { + "epoch": 1.4717099093466708, + "grad_norm": 0.271484375, + "learning_rate": 0.00013964306873901467, + "loss": 1.5724, + "step": 4744 + }, + { + "epoch": 1.472022507033448, + "grad_norm": 0.27734375, + "learning_rate": 0.00013962051017379218, + "loss": 1.6633, + "step": 4745 + }, + { + "epoch": 1.472335104720225, + "grad_norm": 0.265625, + "learning_rate": 0.00013959794921662906, + "loss": 1.5911, + "step": 4746 + }, + { + "epoch": 1.4726477024070022, + "grad_norm": 0.291015625, + "learning_rate": 0.00013957538586888724, + "loss": 1.642, + "step": 4747 + }, + { + "epoch": 1.4729603000937792, + "grad_norm": 0.283203125, + "learning_rate": 0.00013955282013192898, + "loss": 1.4164, + "step": 4748 + }, + { + "epoch": 1.4732728977805565, + "grad_norm": 0.357421875, + "learning_rate": 0.00013953025200711652, + "loss": 2.1398, + "step": 4749 + }, + { + "epoch": 1.4735854954673335, + "grad_norm": 0.283203125, + "learning_rate": 0.00013950768149581242, + "loss": 1.5324, + "step": 4750 + }, + { + "epoch": 1.4738980931541106, + "grad_norm": 0.287109375, + "learning_rate": 0.00013948510859937917, + "loss": 1.5179, + "step": 4751 + }, + { + "epoch": 1.4742106908408878, + "grad_norm": 0.28515625, + "learning_rate": 0.00013946253331917967, + "loss": 1.6216, + "step": 4752 + }, + { + "epoch": 1.474523288527665, + "grad_norm": 0.2890625, + "learning_rate": 0.0001394399556565767, + "loss": 1.7629, + "step": 4753 + }, + { + "epoch": 1.474835886214442, + "grad_norm": 0.26953125, + "learning_rate": 0.00013941737561293339, + "loss": 1.3801, + "step": 4754 + }, + { + "epoch": 1.475148483901219, + "grad_norm": 0.267578125, + "learning_rate": 0.00013939479318961286, + "loss": 1.2722, + "step": 4755 + }, + { + "epoch": 1.4754610815879963, + "grad_norm": 0.271484375, + "learning_rate": 0.0001393722083879785, + "loss": 1.5343, + "step": 
4756 + }, + { + "epoch": 1.4757736792747733, + "grad_norm": 0.271484375, + "learning_rate": 0.00013934962120939367, + "loss": 1.4174, + "step": 4757 + }, + { + "epoch": 1.4760862769615506, + "grad_norm": 0.271484375, + "learning_rate": 0.0001393270316552221, + "loss": 1.6169, + "step": 4758 + }, + { + "epoch": 1.4763988746483276, + "grad_norm": 0.287109375, + "learning_rate": 0.00013930443972682755, + "loss": 1.5633, + "step": 4759 + }, + { + "epoch": 1.4767114723351047, + "grad_norm": 0.287109375, + "learning_rate": 0.00013928184542557386, + "loss": 1.5436, + "step": 4760 + }, + { + "epoch": 1.4770240700218817, + "grad_norm": 0.283203125, + "learning_rate": 0.0001392592487528251, + "loss": 1.4456, + "step": 4761 + }, + { + "epoch": 1.477336667708659, + "grad_norm": 0.291015625, + "learning_rate": 0.00013923664970994548, + "loss": 1.7102, + "step": 4762 + }, + { + "epoch": 1.477649265395436, + "grad_norm": 0.283203125, + "learning_rate": 0.0001392140482982993, + "loss": 1.2565, + "step": 4763 + }, + { + "epoch": 1.4779618630822131, + "grad_norm": 0.28125, + "learning_rate": 0.00013919144451925107, + "loss": 1.6703, + "step": 4764 + }, + { + "epoch": 1.4782744607689904, + "grad_norm": 0.27734375, + "learning_rate": 0.00013916883837416537, + "loss": 1.6054, + "step": 4765 + }, + { + "epoch": 1.4785870584557674, + "grad_norm": 0.2734375, + "learning_rate": 0.000139146229864407, + "loss": 1.7249, + "step": 4766 + }, + { + "epoch": 1.4788996561425445, + "grad_norm": 0.2890625, + "learning_rate": 0.00013912361899134083, + "loss": 1.3501, + "step": 4767 + }, + { + "epoch": 1.4792122538293215, + "grad_norm": 0.26953125, + "learning_rate": 0.00013910100575633197, + "loss": 1.3183, + "step": 4768 + }, + { + "epoch": 1.4795248515160988, + "grad_norm": 0.27734375, + "learning_rate": 0.0001390783901607455, + "loss": 1.4104, + "step": 4769 + }, + { + "epoch": 1.4798374492028759, + "grad_norm": 0.291015625, + "learning_rate": 0.00013905577220594688, + "loss": 1.5617, + "step": 4770 + }, + { + "epoch": 1.4801500468896531, + "grad_norm": 0.296875, + "learning_rate": 0.0001390331518933015, + "loss": 1.5939, + "step": 4771 + }, + { + "epoch": 1.4804626445764302, + "grad_norm": 0.275390625, + "learning_rate": 0.000139010529224175, + "loss": 1.8937, + "step": 4772 + }, + { + "epoch": 1.4807752422632072, + "grad_norm": 0.2890625, + "learning_rate": 0.00013898790419993314, + "loss": 1.5376, + "step": 4773 + }, + { + "epoch": 1.4810878399499843, + "grad_norm": 0.275390625, + "learning_rate": 0.00013896527682194182, + "loss": 1.6127, + "step": 4774 + }, + { + "epoch": 1.4814004376367615, + "grad_norm": 0.271484375, + "learning_rate": 0.00013894264709156704, + "loss": 1.6904, + "step": 4775 + }, + { + "epoch": 1.4817130353235386, + "grad_norm": 0.265625, + "learning_rate": 0.00013892001501017507, + "loss": 1.2735, + "step": 4776 + }, + { + "epoch": 1.4820256330103156, + "grad_norm": 0.30078125, + "learning_rate": 0.00013889738057913222, + "loss": 1.5661, + "step": 4777 + }, + { + "epoch": 1.482338230697093, + "grad_norm": 0.29296875, + "learning_rate": 0.0001388747437998049, + "loss": 1.5189, + "step": 4778 + }, + { + "epoch": 1.48265082838387, + "grad_norm": 0.26171875, + "learning_rate": 0.00013885210467355977, + "loss": 1.6821, + "step": 4779 + }, + { + "epoch": 1.482963426070647, + "grad_norm": 0.294921875, + "learning_rate": 0.0001388294632017636, + "loss": 1.539, + "step": 4780 + }, + { + "epoch": 1.483276023757424, + "grad_norm": 0.287109375, + "learning_rate": 0.0001388068193857832, + "loss": 1.3788, + 
"step": 4781 + }, + { + "epoch": 1.4835886214442013, + "grad_norm": 0.287109375, + "learning_rate": 0.0001387841732269857, + "loss": 1.6355, + "step": 4782 + }, + { + "epoch": 1.4839012191309784, + "grad_norm": 0.265625, + "learning_rate": 0.00013876152472673824, + "loss": 1.5469, + "step": 4783 + }, + { + "epoch": 1.4842138168177557, + "grad_norm": 0.275390625, + "learning_rate": 0.00013873887388640813, + "loss": 1.4548, + "step": 4784 + }, + { + "epoch": 1.4845264145045327, + "grad_norm": 0.2734375, + "learning_rate": 0.00013871622070736283, + "loss": 1.6311, + "step": 4785 + }, + { + "epoch": 1.4848390121913098, + "grad_norm": 0.271484375, + "learning_rate": 0.00013869356519096996, + "loss": 1.6191, + "step": 4786 + }, + { + "epoch": 1.4851516098780868, + "grad_norm": 0.28125, + "learning_rate": 0.00013867090733859724, + "loss": 1.6485, + "step": 4787 + }, + { + "epoch": 1.485464207564864, + "grad_norm": 0.291015625, + "learning_rate": 0.00013864824715161258, + "loss": 1.4496, + "step": 4788 + }, + { + "epoch": 1.4857768052516411, + "grad_norm": 0.28125, + "learning_rate": 0.000138625584631384, + "loss": 1.4758, + "step": 4789 + }, + { + "epoch": 1.4860894029384182, + "grad_norm": 0.2890625, + "learning_rate": 0.00013860291977927963, + "loss": 1.5562, + "step": 4790 + }, + { + "epoch": 1.4864020006251955, + "grad_norm": 0.294921875, + "learning_rate": 0.00013858025259666778, + "loss": 1.4644, + "step": 4791 + }, + { + "epoch": 1.4867145983119725, + "grad_norm": 0.26953125, + "learning_rate": 0.00013855758308491697, + "loss": 1.4079, + "step": 4792 + }, + { + "epoch": 1.4870271959987496, + "grad_norm": 0.28125, + "learning_rate": 0.0001385349112453957, + "loss": 1.7239, + "step": 4793 + }, + { + "epoch": 1.4873397936855266, + "grad_norm": 0.2734375, + "learning_rate": 0.00013851223707947273, + "loss": 1.4636, + "step": 4794 + }, + { + "epoch": 1.4876523913723039, + "grad_norm": 0.287109375, + "learning_rate": 0.00013848956058851695, + "loss": 1.7056, + "step": 4795 + }, + { + "epoch": 1.487964989059081, + "grad_norm": 0.27734375, + "learning_rate": 0.00013846688177389735, + "loss": 1.6746, + "step": 4796 + }, + { + "epoch": 1.4882775867458582, + "grad_norm": 0.28515625, + "learning_rate": 0.00013844420063698307, + "loss": 1.9306, + "step": 4797 + }, + { + "epoch": 1.4885901844326352, + "grad_norm": 0.275390625, + "learning_rate": 0.00013842151717914343, + "loss": 1.5027, + "step": 4798 + }, + { + "epoch": 1.4889027821194123, + "grad_norm": 0.275390625, + "learning_rate": 0.0001383988314017478, + "loss": 1.5049, + "step": 4799 + }, + { + "epoch": 1.4892153798061893, + "grad_norm": 0.2734375, + "learning_rate": 0.0001383761433061658, + "loss": 1.4956, + "step": 4800 + }, + { + "epoch": 1.4892153798061893, + "eval_loss": 1.540488600730896, + "eval_runtime": 1909.2842, + "eval_samples_per_second": 4.786, + "eval_steps_per_second": 2.393, + "step": 4800 + }, + { + "epoch": 1.4895279774929666, + "grad_norm": 0.271484375, + "learning_rate": 0.00013835345289376713, + "loss": 1.6199, + "step": 4801 + }, + { + "epoch": 1.4898405751797437, + "grad_norm": 0.279296875, + "learning_rate": 0.00013833076016592162, + "loss": 1.5649, + "step": 4802 + }, + { + "epoch": 1.4901531728665207, + "grad_norm": 0.265625, + "learning_rate": 0.0001383080651239993, + "loss": 1.2639, + "step": 4803 + }, + { + "epoch": 1.490465770553298, + "grad_norm": 0.26953125, + "learning_rate": 0.00013828536776937025, + "loss": 1.4622, + "step": 4804 + }, + { + "epoch": 1.490778368240075, + "grad_norm": 0.279296875, + 
"learning_rate": 0.00013826266810340477, + "loss": 1.711, + "step": 4805 + }, + { + "epoch": 1.491090965926852, + "grad_norm": 0.28125, + "learning_rate": 0.00013823996612747326, + "loss": 1.8856, + "step": 4806 + }, + { + "epoch": 1.4914035636136291, + "grad_norm": 0.275390625, + "learning_rate": 0.00013821726184294625, + "loss": 1.3782, + "step": 4807 + }, + { + "epoch": 1.4917161613004064, + "grad_norm": 0.34765625, + "learning_rate": 0.00013819455525119448, + "loss": 2.2278, + "step": 4808 + }, + { + "epoch": 1.4920287589871835, + "grad_norm": 0.279296875, + "learning_rate": 0.0001381718463535887, + "loss": 1.5884, + "step": 4809 + }, + { + "epoch": 1.4923413566739607, + "grad_norm": 0.287109375, + "learning_rate": 0.00013814913515149992, + "loss": 1.7534, + "step": 4810 + }, + { + "epoch": 1.4926539543607378, + "grad_norm": 0.267578125, + "learning_rate": 0.00013812642164629923, + "loss": 1.5207, + "step": 4811 + }, + { + "epoch": 1.4929665520475148, + "grad_norm": 0.2734375, + "learning_rate": 0.0001381037058393579, + "loss": 1.9244, + "step": 4812 + }, + { + "epoch": 1.4932791497342919, + "grad_norm": 0.275390625, + "learning_rate": 0.00013808098773204728, + "loss": 1.5499, + "step": 4813 + }, + { + "epoch": 1.4935917474210691, + "grad_norm": 0.275390625, + "learning_rate": 0.0001380582673257389, + "loss": 1.4949, + "step": 4814 + }, + { + "epoch": 1.4939043451078462, + "grad_norm": 0.28125, + "learning_rate": 0.00013803554462180442, + "loss": 1.6817, + "step": 4815 + }, + { + "epoch": 1.4942169427946232, + "grad_norm": 0.2734375, + "learning_rate": 0.0001380128196216157, + "loss": 1.5036, + "step": 4816 + }, + { + "epoch": 1.4945295404814005, + "grad_norm": 0.2734375, + "learning_rate": 0.00013799009232654452, + "loss": 1.7792, + "step": 4817 + }, + { + "epoch": 1.4948421381681776, + "grad_norm": 0.279296875, + "learning_rate": 0.0001379673627379631, + "loss": 1.7147, + "step": 4818 + }, + { + "epoch": 1.4951547358549546, + "grad_norm": 0.27734375, + "learning_rate": 0.00013794463085724362, + "loss": 1.6417, + "step": 4819 + }, + { + "epoch": 1.4954673335417317, + "grad_norm": 0.28125, + "learning_rate": 0.00013792189668575844, + "loss": 1.6789, + "step": 4820 + }, + { + "epoch": 1.495779931228509, + "grad_norm": 0.26171875, + "learning_rate": 0.00013789916022488, + "loss": 1.5019, + "step": 4821 + }, + { + "epoch": 1.496092528915286, + "grad_norm": 0.279296875, + "learning_rate": 0.00013787642147598098, + "loss": 1.3454, + "step": 4822 + }, + { + "epoch": 1.4964051266020633, + "grad_norm": 0.291015625, + "learning_rate": 0.0001378536804404341, + "loss": 1.3393, + "step": 4823 + }, + { + "epoch": 1.4967177242888403, + "grad_norm": 0.283203125, + "learning_rate": 0.00013783093711961232, + "loss": 1.7143, + "step": 4824 + }, + { + "epoch": 1.4970303219756174, + "grad_norm": 0.265625, + "learning_rate": 0.00013780819151488867, + "loss": 1.7801, + "step": 4825 + }, + { + "epoch": 1.4973429196623944, + "grad_norm": 0.275390625, + "learning_rate": 0.0001377854436276363, + "loss": 1.4511, + "step": 4826 + }, + { + "epoch": 1.4976555173491717, + "grad_norm": 0.2578125, + "learning_rate": 0.00013776269345922853, + "loss": 1.5489, + "step": 4827 + }, + { + "epoch": 1.4979681150359487, + "grad_norm": 0.275390625, + "learning_rate": 0.00013773994101103887, + "loss": 1.4367, + "step": 4828 + }, + { + "epoch": 1.4982807127227258, + "grad_norm": 0.27734375, + "learning_rate": 0.00013771718628444084, + "loss": 1.3745, + "step": 4829 + }, + { + "epoch": 1.498593310409503, + "grad_norm": 0.27734375, 
+ "learning_rate": 0.00013769442928080825, + "loss": 1.4794, + "step": 4830 + }, + { + "epoch": 1.49890590809628, + "grad_norm": 0.271484375, + "learning_rate": 0.0001376716700015149, + "loss": 1.4688, + "step": 4831 + }, + { + "epoch": 1.4992185057830572, + "grad_norm": 0.28515625, + "learning_rate": 0.00013764890844793486, + "loss": 1.6018, + "step": 4832 + }, + { + "epoch": 1.4995311034698342, + "grad_norm": 0.27734375, + "learning_rate": 0.00013762614462144222, + "loss": 1.5939, + "step": 4833 + }, + { + "epoch": 1.4998437011566115, + "grad_norm": 0.2890625, + "learning_rate": 0.00013760337852341127, + "loss": 1.4394, + "step": 4834 + }, + { + "epoch": 1.5001562988433885, + "grad_norm": 0.28515625, + "learning_rate": 0.00013758061015521644, + "loss": 1.7241, + "step": 4835 + }, + { + "epoch": 1.5004688965301658, + "grad_norm": 0.28515625, + "learning_rate": 0.00013755783951823231, + "loss": 1.3598, + "step": 4836 + }, + { + "epoch": 1.5007814942169428, + "grad_norm": 0.291015625, + "learning_rate": 0.00013753506661383353, + "loss": 1.6162, + "step": 4837 + }, + { + "epoch": 1.50109409190372, + "grad_norm": 0.271484375, + "learning_rate": 0.00013751229144339498, + "loss": 1.8877, + "step": 4838 + }, + { + "epoch": 1.501406689590497, + "grad_norm": 0.28515625, + "learning_rate": 0.00013748951400829152, + "loss": 1.4028, + "step": 4839 + }, + { + "epoch": 1.501719287277274, + "grad_norm": 0.2734375, + "learning_rate": 0.0001374667343098984, + "loss": 1.4623, + "step": 4840 + }, + { + "epoch": 1.5020318849640513, + "grad_norm": 0.271484375, + "learning_rate": 0.00013744395234959074, + "loss": 1.4917, + "step": 4841 + }, + { + "epoch": 1.5023444826508285, + "grad_norm": 0.283203125, + "learning_rate": 0.00013742116812874398, + "loss": 1.7893, + "step": 4842 + }, + { + "epoch": 1.5026570803376056, + "grad_norm": 0.279296875, + "learning_rate": 0.00013739838164873361, + "loss": 1.8276, + "step": 4843 + }, + { + "epoch": 1.5029696780243826, + "grad_norm": 0.2890625, + "learning_rate": 0.00013737559291093532, + "loss": 1.3587, + "step": 4844 + }, + { + "epoch": 1.5032822757111597, + "grad_norm": 0.275390625, + "learning_rate": 0.0001373528019167248, + "loss": 1.4224, + "step": 4845 + }, + { + "epoch": 1.5035948733979367, + "grad_norm": 0.28125, + "learning_rate": 0.00013733000866747805, + "loss": 1.5353, + "step": 4846 + }, + { + "epoch": 1.503907471084714, + "grad_norm": 0.26171875, + "learning_rate": 0.0001373072131645711, + "loss": 1.4161, + "step": 4847 + }, + { + "epoch": 1.504220068771491, + "grad_norm": 0.28125, + "learning_rate": 0.00013728441540938015, + "loss": 1.6566, + "step": 4848 + }, + { + "epoch": 1.5045326664582683, + "grad_norm": 0.298828125, + "learning_rate": 0.00013726161540328154, + "loss": 1.7235, + "step": 4849 + }, + { + "epoch": 1.5048452641450454, + "grad_norm": 0.275390625, + "learning_rate": 0.00013723881314765173, + "loss": 1.4253, + "step": 4850 + }, + { + "epoch": 1.5051578618318224, + "grad_norm": 0.265625, + "learning_rate": 0.0001372160086438673, + "loss": 1.4144, + "step": 4851 + }, + { + "epoch": 1.5054704595185995, + "grad_norm": 0.29296875, + "learning_rate": 0.00013719320189330502, + "loss": 1.4077, + "step": 4852 + }, + { + "epoch": 1.5057830572053765, + "grad_norm": 0.28515625, + "learning_rate": 0.00013717039289734174, + "loss": 1.3204, + "step": 4853 + }, + { + "epoch": 1.5060956548921538, + "grad_norm": 0.279296875, + "learning_rate": 0.00013714758165735445, + "loss": 1.701, + "step": 4854 + }, + { + "epoch": 1.506408252578931, + "grad_norm": 
0.26171875, + "learning_rate": 0.00013712476817472036, + "loss": 1.6931, + "step": 4855 + }, + { + "epoch": 1.5067208502657081, + "grad_norm": 0.28125, + "learning_rate": 0.00013710195245081666, + "loss": 1.3077, + "step": 4856 + }, + { + "epoch": 1.5070334479524852, + "grad_norm": 0.26953125, + "learning_rate": 0.00013707913448702085, + "loss": 1.6396, + "step": 4857 + }, + { + "epoch": 1.5073460456392622, + "grad_norm": 0.28125, + "learning_rate": 0.00013705631428471046, + "loss": 1.3695, + "step": 4858 + }, + { + "epoch": 1.5076586433260393, + "grad_norm": 0.279296875, + "learning_rate": 0.0001370334918452631, + "loss": 1.6431, + "step": 4859 + }, + { + "epoch": 1.5079712410128165, + "grad_norm": 0.275390625, + "learning_rate": 0.00013701066717005669, + "loss": 1.639, + "step": 4860 + }, + { + "epoch": 1.5082838386995936, + "grad_norm": 0.275390625, + "learning_rate": 0.00013698784026046912, + "loss": 1.587, + "step": 4861 + }, + { + "epoch": 1.5085964363863709, + "grad_norm": 0.279296875, + "learning_rate": 0.00013696501111787847, + "loss": 1.3576, + "step": 4862 + }, + { + "epoch": 1.508909034073148, + "grad_norm": 0.2734375, + "learning_rate": 0.00013694217974366305, + "loss": 1.8408, + "step": 4863 + }, + { + "epoch": 1.509221631759925, + "grad_norm": 0.279296875, + "learning_rate": 0.00013691934613920112, + "loss": 1.4075, + "step": 4864 + }, + { + "epoch": 1.509534229446702, + "grad_norm": 0.296875, + "learning_rate": 0.00013689651030587122, + "loss": 1.5765, + "step": 4865 + }, + { + "epoch": 1.509846827133479, + "grad_norm": 0.28125, + "learning_rate": 0.000136873672245052, + "loss": 1.4989, + "step": 4866 + }, + { + "epoch": 1.5101594248202563, + "grad_norm": 0.26953125, + "learning_rate": 0.00013685083195812218, + "loss": 1.2015, + "step": 4867 + }, + { + "epoch": 1.5104720225070336, + "grad_norm": 0.2890625, + "learning_rate": 0.00013682798944646067, + "loss": 1.5619, + "step": 4868 + }, + { + "epoch": 1.5107846201938107, + "grad_norm": 0.279296875, + "learning_rate": 0.00013680514471144654, + "loss": 1.8652, + "step": 4869 + }, + { + "epoch": 1.5110972178805877, + "grad_norm": 0.29296875, + "learning_rate": 0.0001367822977544589, + "loss": 1.4711, + "step": 4870 + }, + { + "epoch": 1.5114098155673648, + "grad_norm": 0.265625, + "learning_rate": 0.00013675944857687708, + "loss": 1.6081, + "step": 4871 + }, + { + "epoch": 1.5117224132541418, + "grad_norm": 0.28515625, + "learning_rate": 0.00013673659718008046, + "loss": 1.7525, + "step": 4872 + }, + { + "epoch": 1.512035010940919, + "grad_norm": 0.271484375, + "learning_rate": 0.00013671374356544872, + "loss": 1.422, + "step": 4873 + }, + { + "epoch": 1.5123476086276961, + "grad_norm": 0.27734375, + "learning_rate": 0.00013669088773436144, + "loss": 1.496, + "step": 4874 + }, + { + "epoch": 1.5126602063144734, + "grad_norm": 0.287109375, + "learning_rate": 0.00013666802968819857, + "loss": 2.1415, + "step": 4875 + }, + { + "epoch": 1.5129728040012504, + "grad_norm": 0.28125, + "learning_rate": 0.00013664516942833997, + "loss": 1.4566, + "step": 4876 + }, + { + "epoch": 1.5132854016880275, + "grad_norm": 0.26953125, + "learning_rate": 0.00013662230695616584, + "loss": 1.3616, + "step": 4877 + }, + { + "epoch": 1.5135979993748045, + "grad_norm": 0.2890625, + "learning_rate": 0.00013659944227305634, + "loss": 1.3566, + "step": 4878 + }, + { + "epoch": 1.5139105970615816, + "grad_norm": 0.279296875, + "learning_rate": 0.0001365765753803919, + "loss": 1.4325, + "step": 4879 + }, + { + "epoch": 1.5142231947483589, + "grad_norm": 
0.27734375, + "learning_rate": 0.00013655370627955294, + "loss": 1.4522, + "step": 4880 + }, + { + "epoch": 1.5145357924351361, + "grad_norm": 0.28125, + "learning_rate": 0.0001365308349719202, + "loss": 1.3943, + "step": 4881 + }, + { + "epoch": 1.5148483901219132, + "grad_norm": 0.2734375, + "learning_rate": 0.0001365079614588744, + "loss": 1.5389, + "step": 4882 + }, + { + "epoch": 1.5151609878086902, + "grad_norm": 0.294921875, + "learning_rate": 0.0001364850857417964, + "loss": 1.6869, + "step": 4883 + }, + { + "epoch": 1.5154735854954673, + "grad_norm": 0.28125, + "learning_rate": 0.00013646220782206732, + "loss": 1.5724, + "step": 4884 + }, + { + "epoch": 1.5157861831822443, + "grad_norm": 0.27734375, + "learning_rate": 0.00013643932770106827, + "loss": 1.6731, + "step": 4885 + }, + { + "epoch": 1.5160987808690216, + "grad_norm": 0.28125, + "learning_rate": 0.00013641644538018056, + "loss": 1.5248, + "step": 4886 + }, + { + "epoch": 1.5164113785557987, + "grad_norm": 0.279296875, + "learning_rate": 0.00013639356086078566, + "loss": 1.5507, + "step": 4887 + }, + { + "epoch": 1.516723976242576, + "grad_norm": 0.2890625, + "learning_rate": 0.00013637067414426512, + "loss": 1.5547, + "step": 4888 + }, + { + "epoch": 1.517036573929353, + "grad_norm": 0.27734375, + "learning_rate": 0.00013634778523200064, + "loss": 1.4582, + "step": 4889 + }, + { + "epoch": 1.51734917161613, + "grad_norm": 0.26953125, + "learning_rate": 0.000136324894125374, + "loss": 1.5639, + "step": 4890 + }, + { + "epoch": 1.517661769302907, + "grad_norm": 0.28515625, + "learning_rate": 0.00013630200082576728, + "loss": 1.7309, + "step": 4891 + }, + { + "epoch": 1.5179743669896841, + "grad_norm": 0.283203125, + "learning_rate": 0.00013627910533456244, + "loss": 1.5389, + "step": 4892 + }, + { + "epoch": 1.5182869646764614, + "grad_norm": 0.28515625, + "learning_rate": 0.00013625620765314182, + "loss": 1.4838, + "step": 4893 + }, + { + "epoch": 1.5185995623632387, + "grad_norm": 0.28125, + "learning_rate": 0.00013623330778288775, + "loss": 1.484, + "step": 4894 + }, + { + "epoch": 1.5189121600500157, + "grad_norm": 0.29296875, + "learning_rate": 0.0001362104057251827, + "loss": 1.5483, + "step": 4895 + }, + { + "epoch": 1.5192247577367928, + "grad_norm": 0.3359375, + "learning_rate": 0.0001361875014814093, + "loss": 2.0742, + "step": 4896 + }, + { + "epoch": 1.5195373554235698, + "grad_norm": 0.287109375, + "learning_rate": 0.00013616459505295036, + "loss": 1.7894, + "step": 4897 + }, + { + "epoch": 1.5198499531103469, + "grad_norm": 0.30078125, + "learning_rate": 0.0001361416864411887, + "loss": 1.6285, + "step": 4898 + }, + { + "epoch": 1.5201625507971241, + "grad_norm": 0.279296875, + "learning_rate": 0.0001361187756475074, + "loss": 1.4062, + "step": 4899 + }, + { + "epoch": 1.5204751484839012, + "grad_norm": 0.28515625, + "learning_rate": 0.00013609586267328955, + "loss": 1.5702, + "step": 4900 + }, + { + "epoch": 1.5207877461706785, + "grad_norm": 0.279296875, + "learning_rate": 0.0001360729475199185, + "loss": 1.7382, + "step": 4901 + }, + { + "epoch": 1.5211003438574555, + "grad_norm": 0.296875, + "learning_rate": 0.0001360500301887776, + "loss": 1.634, + "step": 4902 + }, + { + "epoch": 1.5214129415442326, + "grad_norm": 0.3046875, + "learning_rate": 0.0001360271106812505, + "loss": 1.8597, + "step": 4903 + }, + { + "epoch": 1.5217255392310096, + "grad_norm": 0.279296875, + "learning_rate": 0.0001360041889987208, + "loss": 1.5893, + "step": 4904 + }, + { + "epoch": 1.5220381369177867, + "grad_norm": 0.28125, + 
"learning_rate": 0.0001359812651425723, + "loss": 1.3249, + "step": 4905 + }, + { + "epoch": 1.522350734604564, + "grad_norm": 0.2734375, + "learning_rate": 0.00013595833911418897, + "loss": 1.4003, + "step": 4906 + }, + { + "epoch": 1.5226633322913412, + "grad_norm": 0.294921875, + "learning_rate": 0.00013593541091495495, + "loss": 1.4114, + "step": 4907 + }, + { + "epoch": 1.5229759299781183, + "grad_norm": 0.298828125, + "learning_rate": 0.00013591248054625434, + "loss": 1.6106, + "step": 4908 + }, + { + "epoch": 1.5232885276648953, + "grad_norm": 0.26953125, + "learning_rate": 0.0001358895480094715, + "loss": 1.6379, + "step": 4909 + }, + { + "epoch": 1.5236011253516724, + "grad_norm": 0.29296875, + "learning_rate": 0.00013586661330599093, + "loss": 1.5513, + "step": 4910 + }, + { + "epoch": 1.5239137230384494, + "grad_norm": 0.275390625, + "learning_rate": 0.0001358436764371972, + "loss": 1.776, + "step": 4911 + }, + { + "epoch": 1.5242263207252267, + "grad_norm": 0.26953125, + "learning_rate": 0.00013582073740447506, + "loss": 1.4984, + "step": 4912 + }, + { + "epoch": 1.5245389184120037, + "grad_norm": 0.279296875, + "learning_rate": 0.00013579779620920935, + "loss": 1.4635, + "step": 4913 + }, + { + "epoch": 1.524851516098781, + "grad_norm": 0.28125, + "learning_rate": 0.00013577485285278505, + "loss": 1.6161, + "step": 4914 + }, + { + "epoch": 1.525164113785558, + "grad_norm": 0.271484375, + "learning_rate": 0.0001357519073365873, + "loss": 1.6316, + "step": 4915 + }, + { + "epoch": 1.525476711472335, + "grad_norm": 0.283203125, + "learning_rate": 0.00013572895966200137, + "loss": 1.6609, + "step": 4916 + }, + { + "epoch": 1.5257893091591122, + "grad_norm": 0.30078125, + "learning_rate": 0.00013570600983041258, + "loss": 1.7015, + "step": 4917 + }, + { + "epoch": 1.5261019068458892, + "grad_norm": 0.271484375, + "learning_rate": 0.0001356830578432065, + "loss": 1.6981, + "step": 4918 + }, + { + "epoch": 1.5264145045326665, + "grad_norm": 0.291015625, + "learning_rate": 0.00013566010370176876, + "loss": 1.9733, + "step": 4919 + }, + { + "epoch": 1.5267271022194435, + "grad_norm": 0.27734375, + "learning_rate": 0.00013563714740748507, + "loss": 1.6131, + "step": 4920 + }, + { + "epoch": 1.5270396999062208, + "grad_norm": 0.294921875, + "learning_rate": 0.00013561418896174143, + "loss": 1.6744, + "step": 4921 + }, + { + "epoch": 1.5273522975929978, + "grad_norm": 0.283203125, + "learning_rate": 0.00013559122836592378, + "loss": 1.6179, + "step": 4922 + }, + { + "epoch": 1.527664895279775, + "grad_norm": 0.2734375, + "learning_rate": 0.00013556826562141833, + "loss": 1.6257, + "step": 4923 + }, + { + "epoch": 1.527977492966552, + "grad_norm": 0.275390625, + "learning_rate": 0.00013554530072961137, + "loss": 1.4059, + "step": 4924 + }, + { + "epoch": 1.5282900906533292, + "grad_norm": 0.28125, + "learning_rate": 0.00013552233369188934, + "loss": 1.6328, + "step": 4925 + }, + { + "epoch": 1.5286026883401063, + "grad_norm": 0.291015625, + "learning_rate": 0.0001354993645096387, + "loss": 1.4355, + "step": 4926 + }, + { + "epoch": 1.5289152860268835, + "grad_norm": 0.27734375, + "learning_rate": 0.00013547639318424622, + "loss": 1.3874, + "step": 4927 + }, + { + "epoch": 1.5292278837136606, + "grad_norm": 0.279296875, + "learning_rate": 0.00013545341971709865, + "loss": 1.658, + "step": 4928 + }, + { + "epoch": 1.5295404814004376, + "grad_norm": 0.291015625, + "learning_rate": 0.00013543044410958295, + "loss": 1.6181, + "step": 4929 + }, + { + "epoch": 1.5298530790872147, + "grad_norm": 
0.279296875, + "learning_rate": 0.00013540746636308624, + "loss": 1.5662, + "step": 4930 + }, + { + "epoch": 1.5301656767739917, + "grad_norm": 0.28515625, + "learning_rate": 0.00013538448647899563, + "loss": 1.5986, + "step": 4931 + }, + { + "epoch": 1.530478274460769, + "grad_norm": 0.275390625, + "learning_rate": 0.00013536150445869847, + "loss": 1.4048, + "step": 4932 + }, + { + "epoch": 1.530790872147546, + "grad_norm": 0.2890625, + "learning_rate": 0.00013533852030358224, + "loss": 1.7174, + "step": 4933 + }, + { + "epoch": 1.5311034698343233, + "grad_norm": 0.287109375, + "learning_rate": 0.0001353155340150345, + "loss": 1.513, + "step": 4934 + }, + { + "epoch": 1.5314160675211004, + "grad_norm": 0.26953125, + "learning_rate": 0.000135292545594443, + "loss": 1.6644, + "step": 4935 + }, + { + "epoch": 1.5317286652078774, + "grad_norm": 0.283203125, + "learning_rate": 0.0001352695550431955, + "loss": 1.6251, + "step": 4936 + }, + { + "epoch": 1.5320412628946545, + "grad_norm": 0.27734375, + "learning_rate": 0.00013524656236268005, + "loss": 1.4987, + "step": 4937 + }, + { + "epoch": 1.5323538605814317, + "grad_norm": 0.28125, + "learning_rate": 0.0001352235675542847, + "loss": 1.4943, + "step": 4938 + }, + { + "epoch": 1.5326664582682088, + "grad_norm": 0.279296875, + "learning_rate": 0.0001352005706193977, + "loss": 1.571, + "step": 4939 + }, + { + "epoch": 1.532979055954986, + "grad_norm": 0.28515625, + "learning_rate": 0.00013517757155940736, + "loss": 1.687, + "step": 4940 + }, + { + "epoch": 1.5332916536417631, + "grad_norm": 0.28125, + "learning_rate": 0.00013515457037570222, + "loss": 1.5219, + "step": 4941 + }, + { + "epoch": 1.5336042513285402, + "grad_norm": 0.283203125, + "learning_rate": 0.0001351315670696709, + "loss": 1.7764, + "step": 4942 + }, + { + "epoch": 1.5339168490153172, + "grad_norm": 0.28125, + "learning_rate": 0.0001351085616427021, + "loss": 1.6955, + "step": 4943 + }, + { + "epoch": 1.5342294467020943, + "grad_norm": 0.27734375, + "learning_rate": 0.00013508555409618466, + "loss": 1.4089, + "step": 4944 + }, + { + "epoch": 1.5345420443888715, + "grad_norm": 0.302734375, + "learning_rate": 0.00013506254443150764, + "loss": 1.5426, + "step": 4945 + }, + { + "epoch": 1.5348546420756486, + "grad_norm": 0.28125, + "learning_rate": 0.0001350395326500601, + "loss": 1.8171, + "step": 4946 + }, + { + "epoch": 1.5351672397624259, + "grad_norm": 0.287109375, + "learning_rate": 0.00013501651875323133, + "loss": 1.4028, + "step": 4947 + }, + { + "epoch": 1.535479837449203, + "grad_norm": 0.291015625, + "learning_rate": 0.00013499350274241074, + "loss": 1.8051, + "step": 4948 + }, + { + "epoch": 1.53579243513598, + "grad_norm": 0.291015625, + "learning_rate": 0.00013497048461898775, + "loss": 1.8259, + "step": 4949 + }, + { + "epoch": 1.536105032822757, + "grad_norm": 0.294921875, + "learning_rate": 0.00013494746438435205, + "loss": 1.4848, + "step": 4950 + }, + { + "epoch": 1.5364176305095343, + "grad_norm": 0.2890625, + "learning_rate": 0.0001349244420398934, + "loss": 1.506, + "step": 4951 + }, + { + "epoch": 1.5367302281963113, + "grad_norm": 0.265625, + "learning_rate": 0.00013490141758700167, + "loss": 1.4098, + "step": 4952 + }, + { + "epoch": 1.5370428258830886, + "grad_norm": 0.2890625, + "learning_rate": 0.00013487839102706693, + "loss": 1.6048, + "step": 4953 + }, + { + "epoch": 1.5373554235698657, + "grad_norm": 0.2734375, + "learning_rate": 0.00013485536236147925, + "loss": 1.7807, + "step": 4954 + }, + { + "epoch": 1.5376680212566427, + "grad_norm": 
0.279296875, + "learning_rate": 0.00013483233159162892, + "loss": 1.6532, + "step": 4955 + }, + { + "epoch": 1.5379806189434198, + "grad_norm": 0.279296875, + "learning_rate": 0.00013480929871890633, + "loss": 1.5882, + "step": 4956 + }, + { + "epoch": 1.5382932166301968, + "grad_norm": 0.28125, + "learning_rate": 0.00013478626374470202, + "loss": 1.3998, + "step": 4957 + }, + { + "epoch": 1.538605814316974, + "grad_norm": 0.28125, + "learning_rate": 0.00013476322667040663, + "loss": 1.5972, + "step": 4958 + }, + { + "epoch": 1.5389184120037511, + "grad_norm": 0.27734375, + "learning_rate": 0.000134740187497411, + "loss": 1.8103, + "step": 4959 + }, + { + "epoch": 1.5392310096905284, + "grad_norm": 0.283203125, + "learning_rate": 0.00013471714622710596, + "loss": 1.5846, + "step": 4960 + }, + { + "epoch": 1.5395436073773054, + "grad_norm": 0.28515625, + "learning_rate": 0.00013469410286088255, + "loss": 1.2591, + "step": 4961 + }, + { + "epoch": 1.5398562050640825, + "grad_norm": 0.291015625, + "learning_rate": 0.00013467105740013193, + "loss": 1.6226, + "step": 4962 + }, + { + "epoch": 1.5401688027508595, + "grad_norm": 0.2890625, + "learning_rate": 0.00013464800984624542, + "loss": 1.5573, + "step": 4963 + }, + { + "epoch": 1.5404814004376368, + "grad_norm": 0.263671875, + "learning_rate": 0.00013462496020061438, + "loss": 1.7478, + "step": 4964 + }, + { + "epoch": 1.5407939981244139, + "grad_norm": 0.2890625, + "learning_rate": 0.00013460190846463035, + "loss": 1.522, + "step": 4965 + }, + { + "epoch": 1.5411065958111911, + "grad_norm": 0.265625, + "learning_rate": 0.00013457885463968508, + "loss": 1.8658, + "step": 4966 + }, + { + "epoch": 1.5414191934979682, + "grad_norm": 0.28515625, + "learning_rate": 0.00013455579872717025, + "loss": 1.6142, + "step": 4967 + }, + { + "epoch": 1.5417317911847452, + "grad_norm": 0.28125, + "learning_rate": 0.0001345327407284778, + "loss": 1.5669, + "step": 4968 + }, + { + "epoch": 1.5420443888715223, + "grad_norm": 0.28125, + "learning_rate": 0.0001345096806449998, + "loss": 1.5322, + "step": 4969 + }, + { + "epoch": 1.5423569865582993, + "grad_norm": 0.267578125, + "learning_rate": 0.0001344866184781284, + "loss": 1.3081, + "step": 4970 + }, + { + "epoch": 1.5426695842450766, + "grad_norm": 0.28515625, + "learning_rate": 0.00013446355422925592, + "loss": 1.6975, + "step": 4971 + }, + { + "epoch": 1.5429821819318537, + "grad_norm": 0.294921875, + "learning_rate": 0.00013444048789977472, + "loss": 1.695, + "step": 4972 + }, + { + "epoch": 1.543294779618631, + "grad_norm": 0.26171875, + "learning_rate": 0.0001344174194910774, + "loss": 1.7535, + "step": 4973 + }, + { + "epoch": 1.543607377305408, + "grad_norm": 0.29296875, + "learning_rate": 0.0001343943490045566, + "loss": 1.4863, + "step": 4974 + }, + { + "epoch": 1.543919974992185, + "grad_norm": 0.2734375, + "learning_rate": 0.0001343712764416051, + "loss": 1.4841, + "step": 4975 + }, + { + "epoch": 1.544232572678962, + "grad_norm": 0.26171875, + "learning_rate": 0.00013434820180361587, + "loss": 1.6745, + "step": 4976 + }, + { + "epoch": 1.5445451703657391, + "grad_norm": 0.291015625, + "learning_rate": 0.00013432512509198196, + "loss": 1.4136, + "step": 4977 + }, + { + "epoch": 1.5448577680525164, + "grad_norm": 0.28125, + "learning_rate": 0.00013430204630809645, + "loss": 1.6532, + "step": 4978 + }, + { + "epoch": 1.5451703657392937, + "grad_norm": 0.294921875, + "learning_rate": 0.00013427896545335273, + "loss": 1.765, + "step": 4979 + }, + { + "epoch": 1.5454829634260707, + "grad_norm": 
0.26953125, + "learning_rate": 0.00013425588252914415, + "loss": 1.6788, + "step": 4980 + }, + { + "epoch": 1.5457955611128478, + "grad_norm": 0.28515625, + "learning_rate": 0.0001342327975368643, + "loss": 1.5697, + "step": 4981 + }, + { + "epoch": 1.5461081587996248, + "grad_norm": 0.275390625, + "learning_rate": 0.00013420971047790683, + "loss": 1.566, + "step": 4982 + }, + { + "epoch": 1.5464207564864019, + "grad_norm": 0.291015625, + "learning_rate": 0.00013418662135366557, + "loss": 1.6018, + "step": 4983 + }, + { + "epoch": 1.5467333541731791, + "grad_norm": 0.279296875, + "learning_rate": 0.0001341635301655344, + "loss": 1.5691, + "step": 4984 + }, + { + "epoch": 1.5470459518599562, + "grad_norm": 0.28515625, + "learning_rate": 0.0001341404369149074, + "loss": 1.509, + "step": 4985 + }, + { + "epoch": 1.5473585495467335, + "grad_norm": 0.2890625, + "learning_rate": 0.00013411734160317866, + "loss": 1.6771, + "step": 4986 + }, + { + "epoch": 1.5476711472335105, + "grad_norm": 0.28125, + "learning_rate": 0.00013409424423174257, + "loss": 1.4246, + "step": 4987 + }, + { + "epoch": 1.5479837449202876, + "grad_norm": 0.279296875, + "learning_rate": 0.00013407114480199349, + "loss": 1.589, + "step": 4988 + }, + { + "epoch": 1.5482963426070646, + "grad_norm": 0.26953125, + "learning_rate": 0.00013404804331532605, + "loss": 1.4727, + "step": 4989 + }, + { + "epoch": 1.5486089402938417, + "grad_norm": 0.2734375, + "learning_rate": 0.00013402493977313478, + "loss": 1.6676, + "step": 4990 + }, + { + "epoch": 1.548921537980619, + "grad_norm": 0.279296875, + "learning_rate": 0.00013400183417681456, + "loss": 1.4347, + "step": 4991 + }, + { + "epoch": 1.5492341356673962, + "grad_norm": 0.265625, + "learning_rate": 0.00013397872652776025, + "loss": 1.4311, + "step": 4992 + }, + { + "epoch": 1.5495467333541733, + "grad_norm": 0.28515625, + "learning_rate": 0.00013395561682736694, + "loss": 1.3854, + "step": 4993 + }, + { + "epoch": 1.5498593310409503, + "grad_norm": 0.275390625, + "learning_rate": 0.00013393250507702978, + "loss": 1.513, + "step": 4994 + }, + { + "epoch": 1.5501719287277274, + "grad_norm": 0.291015625, + "learning_rate": 0.0001339093912781441, + "loss": 1.6341, + "step": 4995 + }, + { + "epoch": 1.5504845264145044, + "grad_norm": 0.279296875, + "learning_rate": 0.00013388627543210515, + "loss": 1.5025, + "step": 4996 + }, + { + "epoch": 1.5507971241012817, + "grad_norm": 0.279296875, + "learning_rate": 0.00013386315754030864, + "loss": 1.6215, + "step": 4997 + }, + { + "epoch": 1.5511097217880587, + "grad_norm": 0.275390625, + "learning_rate": 0.0001338400376041501, + "loss": 1.6972, + "step": 4998 + }, + { + "epoch": 1.551422319474836, + "grad_norm": 0.279296875, + "learning_rate": 0.00013381691562502543, + "loss": 1.7307, + "step": 4999 + }, + { + "epoch": 1.551734917161613, + "grad_norm": 0.296875, + "learning_rate": 0.00013379379160433045, + "loss": 1.9134, + "step": 5000 + }, + { + "epoch": 1.55204751484839, + "grad_norm": 0.28515625, + "learning_rate": 0.00013377066554346123, + "loss": 1.6386, + "step": 5001 + }, + { + "epoch": 1.5523601125351671, + "grad_norm": 0.291015625, + "learning_rate": 0.00013374753744381385, + "loss": 1.2689, + "step": 5002 + }, + { + "epoch": 1.5526727102219442, + "grad_norm": 0.28515625, + "learning_rate": 0.00013372440730678465, + "loss": 1.5805, + "step": 5003 + }, + { + "epoch": 1.5529853079087215, + "grad_norm": 0.2890625, + "learning_rate": 0.00013370127513377, + "loss": 1.4609, + "step": 5004 + }, + { + "epoch": 1.5532979055954987, + 
"grad_norm": 0.283203125, + "learning_rate": 0.00013367814092616644, + "loss": 1.6764, + "step": 5005 + }, + { + "epoch": 1.5536105032822758, + "grad_norm": 0.296875, + "learning_rate": 0.00013365500468537057, + "loss": 1.4582, + "step": 5006 + }, + { + "epoch": 1.5539231009690528, + "grad_norm": 0.28125, + "learning_rate": 0.00013363186641277922, + "loss": 1.6675, + "step": 5007 + }, + { + "epoch": 1.55423569865583, + "grad_norm": 0.2890625, + "learning_rate": 0.0001336087261097892, + "loss": 1.5632, + "step": 5008 + }, + { + "epoch": 1.554548296342607, + "grad_norm": 0.287109375, + "learning_rate": 0.0001335855837777976, + "loss": 1.6013, + "step": 5009 + }, + { + "epoch": 1.5548608940293842, + "grad_norm": 0.267578125, + "learning_rate": 0.00013356243941820144, + "loss": 1.8472, + "step": 5010 + }, + { + "epoch": 1.5551734917161613, + "grad_norm": 0.275390625, + "learning_rate": 0.0001335392930323981, + "loss": 1.6139, + "step": 5011 + }, + { + "epoch": 1.5554860894029385, + "grad_norm": 0.28125, + "learning_rate": 0.00013351614462178487, + "loss": 1.5984, + "step": 5012 + }, + { + "epoch": 1.5557986870897156, + "grad_norm": 0.279296875, + "learning_rate": 0.0001334929941877593, + "loss": 1.5614, + "step": 5013 + }, + { + "epoch": 1.5561112847764926, + "grad_norm": 0.32421875, + "learning_rate": 0.00013346984173171896, + "loss": 2.3071, + "step": 5014 + }, + { + "epoch": 1.5564238824632697, + "grad_norm": 0.296875, + "learning_rate": 0.00013344668725506165, + "loss": 1.3938, + "step": 5015 + }, + { + "epoch": 1.5567364801500467, + "grad_norm": 0.267578125, + "learning_rate": 0.00013342353075918522, + "loss": 1.7061, + "step": 5016 + }, + { + "epoch": 1.557049077836824, + "grad_norm": 0.28125, + "learning_rate": 0.00013340037224548765, + "loss": 1.7717, + "step": 5017 + }, + { + "epoch": 1.5573616755236013, + "grad_norm": 0.28125, + "learning_rate": 0.000133377211715367, + "loss": 1.478, + "step": 5018 + }, + { + "epoch": 1.5576742732103783, + "grad_norm": 0.279296875, + "learning_rate": 0.0001333540491702216, + "loss": 1.64, + "step": 5019 + }, + { + "epoch": 1.5579868708971554, + "grad_norm": 0.2890625, + "learning_rate": 0.00013333088461144968, + "loss": 1.7206, + "step": 5020 + }, + { + "epoch": 1.5582994685839324, + "grad_norm": 0.275390625, + "learning_rate": 0.00013330771804044984, + "loss": 1.5077, + "step": 5021 + }, + { + "epoch": 1.5586120662707095, + "grad_norm": 0.294921875, + "learning_rate": 0.0001332845494586206, + "loss": 1.9248, + "step": 5022 + }, + { + "epoch": 1.5589246639574867, + "grad_norm": 0.2890625, + "learning_rate": 0.00013326137886736069, + "loss": 1.6623, + "step": 5023 + }, + { + "epoch": 1.5592372616442638, + "grad_norm": 0.287109375, + "learning_rate": 0.00013323820626806896, + "loss": 1.4811, + "step": 5024 + }, + { + "epoch": 1.559549859331041, + "grad_norm": 0.265625, + "learning_rate": 0.00013321503166214435, + "loss": 1.5143, + "step": 5025 + }, + { + "epoch": 1.5598624570178181, + "grad_norm": 0.279296875, + "learning_rate": 0.00013319185505098597, + "loss": 1.5872, + "step": 5026 + }, + { + "epoch": 1.5601750547045952, + "grad_norm": 0.267578125, + "learning_rate": 0.000133168676435993, + "loss": 1.4517, + "step": 5027 + }, + { + "epoch": 1.5604876523913722, + "grad_norm": 0.27734375, + "learning_rate": 0.00013314549581856474, + "loss": 1.3147, + "step": 5028 + }, + { + "epoch": 1.5608002500781493, + "grad_norm": 0.275390625, + "learning_rate": 0.00013312231320010068, + "loss": 1.5296, + "step": 5029 + }, + { + "epoch": 1.5611128477649265, + 
"grad_norm": 0.2578125, + "learning_rate": 0.00013309912858200037, + "loss": 1.4328, + "step": 5030 + }, + { + "epoch": 1.5614254454517038, + "grad_norm": 0.28125, + "learning_rate": 0.00013307594196566348, + "loss": 1.3094, + "step": 5031 + }, + { + "epoch": 1.5617380431384809, + "grad_norm": 0.275390625, + "learning_rate": 0.00013305275335248983, + "loss": 1.3946, + "step": 5032 + }, + { + "epoch": 1.562050640825258, + "grad_norm": 0.298828125, + "learning_rate": 0.00013302956274387933, + "loss": 1.3143, + "step": 5033 + }, + { + "epoch": 1.562363238512035, + "grad_norm": 0.28125, + "learning_rate": 0.00013300637014123206, + "loss": 1.4089, + "step": 5034 + }, + { + "epoch": 1.562675836198812, + "grad_norm": 0.2734375, + "learning_rate": 0.00013298317554594815, + "loss": 1.808, + "step": 5035 + }, + { + "epoch": 1.5629884338855893, + "grad_norm": 0.279296875, + "learning_rate": 0.00013295997895942788, + "loss": 1.4552, + "step": 5036 + }, + { + "epoch": 1.5633010315723663, + "grad_norm": 0.291015625, + "learning_rate": 0.00013293678038307172, + "loss": 1.5808, + "step": 5037 + }, + { + "epoch": 1.5636136292591436, + "grad_norm": 0.28515625, + "learning_rate": 0.00013291357981828013, + "loss": 1.6731, + "step": 5038 + }, + { + "epoch": 1.5639262269459207, + "grad_norm": 0.265625, + "learning_rate": 0.0001328903772664538, + "loss": 1.7043, + "step": 5039 + }, + { + "epoch": 1.5642388246326977, + "grad_norm": 0.29296875, + "learning_rate": 0.00013286717272899346, + "loss": 1.4325, + "step": 5040 + }, + { + "epoch": 1.5645514223194747, + "grad_norm": 0.283203125, + "learning_rate": 0.00013284396620730002, + "loss": 1.4672, + "step": 5041 + }, + { + "epoch": 1.5648640200062518, + "grad_norm": 0.26171875, + "learning_rate": 0.0001328207577027745, + "loss": 1.602, + "step": 5042 + }, + { + "epoch": 1.565176617693029, + "grad_norm": 0.2734375, + "learning_rate": 0.000132797547216818, + "loss": 1.2986, + "step": 5043 + }, + { + "epoch": 1.5654892153798063, + "grad_norm": 0.271484375, + "learning_rate": 0.00013277433475083182, + "loss": 1.3907, + "step": 5044 + }, + { + "epoch": 1.5658018130665834, + "grad_norm": 0.2890625, + "learning_rate": 0.00013275112030621724, + "loss": 1.4888, + "step": 5045 + }, + { + "epoch": 1.5661144107533604, + "grad_norm": 0.287109375, + "learning_rate": 0.00013272790388437579, + "loss": 1.5324, + "step": 5046 + }, + { + "epoch": 1.5664270084401375, + "grad_norm": 0.267578125, + "learning_rate": 0.00013270468548670913, + "loss": 1.4203, + "step": 5047 + }, + { + "epoch": 1.5667396061269145, + "grad_norm": 0.26953125, + "learning_rate": 0.0001326814651146189, + "loss": 1.6704, + "step": 5048 + }, + { + "epoch": 1.5670522038136918, + "grad_norm": 0.271484375, + "learning_rate": 0.00013265824276950696, + "loss": 1.4298, + "step": 5049 + }, + { + "epoch": 1.5673648015004689, + "grad_norm": 0.267578125, + "learning_rate": 0.00013263501845277528, + "loss": 1.2549, + "step": 5050 + }, + { + "epoch": 1.5676773991872461, + "grad_norm": 0.279296875, + "learning_rate": 0.000132611792165826, + "loss": 1.4979, + "step": 5051 + }, + { + "epoch": 1.5679899968740232, + "grad_norm": 0.2890625, + "learning_rate": 0.0001325885639100612, + "loss": 1.5943, + "step": 5052 + }, + { + "epoch": 1.5683025945608002, + "grad_norm": 0.265625, + "learning_rate": 0.00013256533368688334, + "loss": 1.5182, + "step": 5053 + }, + { + "epoch": 1.5686151922475773, + "grad_norm": 0.302734375, + "learning_rate": 0.00013254210149769475, + "loss": 1.413, + "step": 5054 + }, + { + "epoch": 1.5689277899343543, 
+ "grad_norm": 0.28125, + "learning_rate": 0.000132518867343898, + "loss": 1.7251, + "step": 5055 + }, + { + "epoch": 1.5692403876211316, + "grad_norm": 0.306640625, + "learning_rate": 0.00013249563122689584, + "loss": 1.774, + "step": 5056 + }, + { + "epoch": 1.5695529853079089, + "grad_norm": 0.263671875, + "learning_rate": 0.000132472393148091, + "loss": 1.4336, + "step": 5057 + }, + { + "epoch": 1.569865582994686, + "grad_norm": 0.2734375, + "learning_rate": 0.00013244915310888636, + "loss": 1.551, + "step": 5058 + }, + { + "epoch": 1.570178180681463, + "grad_norm": 0.279296875, + "learning_rate": 0.00013242591111068506, + "loss": 1.5781, + "step": 5059 + }, + { + "epoch": 1.57049077836824, + "grad_norm": 0.279296875, + "learning_rate": 0.00013240266715489017, + "loss": 1.2934, + "step": 5060 + }, + { + "epoch": 1.570803376055017, + "grad_norm": 0.27734375, + "learning_rate": 0.0001323794212429049, + "loss": 1.2185, + "step": 5061 + }, + { + "epoch": 1.5711159737417943, + "grad_norm": 0.28515625, + "learning_rate": 0.0001323561733761328, + "loss": 1.4913, + "step": 5062 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.26953125, + "learning_rate": 0.00013233292355597725, + "loss": 1.7294, + "step": 5063 + }, + { + "epoch": 1.5717411691153487, + "grad_norm": 0.271484375, + "learning_rate": 0.0001323096717838419, + "loss": 1.6615, + "step": 5064 + }, + { + "epoch": 1.5720537668021257, + "grad_norm": 0.392578125, + "learning_rate": 0.0001322864180611305, + "loss": 2.0632, + "step": 5065 + }, + { + "epoch": 1.5723663644889028, + "grad_norm": 0.27734375, + "learning_rate": 0.0001322631623892469, + "loss": 1.6026, + "step": 5066 + }, + { + "epoch": 1.5726789621756798, + "grad_norm": 0.279296875, + "learning_rate": 0.00013223990476959505, + "loss": 1.3494, + "step": 5067 + }, + { + "epoch": 1.5729915598624569, + "grad_norm": 0.28125, + "learning_rate": 0.0001322166452035791, + "loss": 1.6345, + "step": 5068 + }, + { + "epoch": 1.5733041575492341, + "grad_norm": 0.275390625, + "learning_rate": 0.00013219338369260317, + "loss": 1.3887, + "step": 5069 + }, + { + "epoch": 1.5736167552360114, + "grad_norm": 0.27734375, + "learning_rate": 0.0001321701202380717, + "loss": 1.525, + "step": 5070 + }, + { + "epoch": 1.5739293529227885, + "grad_norm": 0.28125, + "learning_rate": 0.00013214685484138903, + "loss": 1.5575, + "step": 5071 + }, + { + "epoch": 1.5742419506095655, + "grad_norm": 0.279296875, + "learning_rate": 0.00013212358750395984, + "loss": 1.4927, + "step": 5072 + }, + { + "epoch": 1.5745545482963426, + "grad_norm": 0.26171875, + "learning_rate": 0.00013210031822718867, + "loss": 1.7897, + "step": 5073 + }, + { + "epoch": 1.5748671459831196, + "grad_norm": 0.27734375, + "learning_rate": 0.0001320770470124804, + "loss": 1.4913, + "step": 5074 + }, + { + "epoch": 1.5751797436698969, + "grad_norm": 0.27734375, + "learning_rate": 0.0001320537738612399, + "loss": 1.5544, + "step": 5075 + }, + { + "epoch": 1.575492341356674, + "grad_norm": 0.28125, + "learning_rate": 0.00013203049877487226, + "loss": 1.5843, + "step": 5076 + }, + { + "epoch": 1.5758049390434512, + "grad_norm": 0.28125, + "learning_rate": 0.0001320072217547826, + "loss": 1.4814, + "step": 5077 + }, + { + "epoch": 1.5761175367302283, + "grad_norm": 0.30078125, + "learning_rate": 0.00013198394280237617, + "loss": 1.5352, + "step": 5078 + }, + { + "epoch": 1.5764301344170053, + "grad_norm": 0.26953125, + "learning_rate": 0.00013196066191905833, + "loss": 1.5874, + "step": 5079 + }, + { + "epoch": 1.5767427321037824, + 
"grad_norm": 0.279296875, + "learning_rate": 0.00013193737910623463, + "loss": 1.3757, + "step": 5080 + }, + { + "epoch": 1.5770553297905594, + "grad_norm": 0.294921875, + "learning_rate": 0.00013191409436531063, + "loss": 1.9468, + "step": 5081 + }, + { + "epoch": 1.5773679274773367, + "grad_norm": 0.28125, + "learning_rate": 0.00013189080769769208, + "loss": 1.6272, + "step": 5082 + }, + { + "epoch": 1.577680525164114, + "grad_norm": 0.2734375, + "learning_rate": 0.00013186751910478488, + "loss": 1.3636, + "step": 5083 + }, + { + "epoch": 1.577993122850891, + "grad_norm": 0.275390625, + "learning_rate": 0.00013184422858799493, + "loss": 1.6307, + "step": 5084 + }, + { + "epoch": 1.578305720537668, + "grad_norm": 0.283203125, + "learning_rate": 0.00013182093614872827, + "loss": 1.7675, + "step": 5085 + }, + { + "epoch": 1.578618318224445, + "grad_norm": 0.296875, + "learning_rate": 0.0001317976417883912, + "loss": 1.6317, + "step": 5086 + }, + { + "epoch": 1.5789309159112221, + "grad_norm": 0.298828125, + "learning_rate": 0.0001317743455083899, + "loss": 1.7721, + "step": 5087 + }, + { + "epoch": 1.5792435135979994, + "grad_norm": 0.28125, + "learning_rate": 0.00013175104731013096, + "loss": 1.5939, + "step": 5088 + }, + { + "epoch": 1.5795561112847765, + "grad_norm": 0.28125, + "learning_rate": 0.0001317277471950208, + "loss": 1.3737, + "step": 5089 + }, + { + "epoch": 1.5798687089715537, + "grad_norm": 0.28125, + "learning_rate": 0.0001317044451644661, + "loss": 1.7188, + "step": 5090 + }, + { + "epoch": 1.5801813066583308, + "grad_norm": 0.283203125, + "learning_rate": 0.00013168114121987366, + "loss": 1.4267, + "step": 5091 + }, + { + "epoch": 1.5804939043451078, + "grad_norm": 0.26953125, + "learning_rate": 0.00013165783536265034, + "loss": 1.8701, + "step": 5092 + }, + { + "epoch": 1.5808065020318849, + "grad_norm": 0.279296875, + "learning_rate": 0.00013163452759420313, + "loss": 1.7387, + "step": 5093 + }, + { + "epoch": 1.581119099718662, + "grad_norm": 0.275390625, + "learning_rate": 0.0001316112179159392, + "loss": 1.541, + "step": 5094 + }, + { + "epoch": 1.5814316974054392, + "grad_norm": 0.29296875, + "learning_rate": 0.0001315879063292658, + "loss": 1.7659, + "step": 5095 + }, + { + "epoch": 1.5817442950922165, + "grad_norm": 0.28125, + "learning_rate": 0.00013156459283559022, + "loss": 1.6057, + "step": 5096 + }, + { + "epoch": 1.5820568927789935, + "grad_norm": 0.28515625, + "learning_rate": 0.00013154127743631992, + "loss": 1.5351, + "step": 5097 + }, + { + "epoch": 1.5823694904657706, + "grad_norm": 0.283203125, + "learning_rate": 0.00013151796013286253, + "loss": 1.2022, + "step": 5098 + }, + { + "epoch": 1.5826820881525476, + "grad_norm": 0.302734375, + "learning_rate": 0.00013149464092662572, + "loss": 1.5904, + "step": 5099 + }, + { + "epoch": 1.5829946858393247, + "grad_norm": 0.267578125, + "learning_rate": 0.0001314713198190173, + "loss": 1.641, + "step": 5100 + }, + { + "epoch": 1.583307283526102, + "grad_norm": 0.283203125, + "learning_rate": 0.0001314479968114452, + "loss": 1.502, + "step": 5101 + }, + { + "epoch": 1.583619881212879, + "grad_norm": 0.28515625, + "learning_rate": 0.00013142467190531746, + "loss": 1.3542, + "step": 5102 + }, + { + "epoch": 1.5839324788996563, + "grad_norm": 0.287109375, + "learning_rate": 0.00013140134510204222, + "loss": 1.6327, + "step": 5103 + }, + { + "epoch": 1.5842450765864333, + "grad_norm": 0.275390625, + "learning_rate": 0.00013137801640302778, + "loss": 1.8839, + "step": 5104 + }, + { + "epoch": 1.5845576742732104, + 
"grad_norm": 0.2890625, + "learning_rate": 0.00013135468580968248, + "loss": 1.7045, + "step": 5105 + }, + { + "epoch": 1.5848702719599874, + "grad_norm": 0.2890625, + "learning_rate": 0.0001313313533234149, + "loss": 1.5658, + "step": 5106 + }, + { + "epoch": 1.5851828696467645, + "grad_norm": 0.28515625, + "learning_rate": 0.00013130801894563354, + "loss": 1.4013, + "step": 5107 + }, + { + "epoch": 1.5854954673335417, + "grad_norm": 0.283203125, + "learning_rate": 0.00013128468267774722, + "loss": 1.6291, + "step": 5108 + }, + { + "epoch": 1.585808065020319, + "grad_norm": 0.265625, + "learning_rate": 0.00013126134452116466, + "loss": 1.5818, + "step": 5109 + }, + { + "epoch": 1.586120662707096, + "grad_norm": 0.27734375, + "learning_rate": 0.00013123800447729497, + "loss": 1.6866, + "step": 5110 + }, + { + "epoch": 1.5864332603938731, + "grad_norm": 0.275390625, + "learning_rate": 0.00013121466254754712, + "loss": 1.697, + "step": 5111 + }, + { + "epoch": 1.5867458580806502, + "grad_norm": 0.2734375, + "learning_rate": 0.0001311913187333303, + "loss": 1.6696, + "step": 5112 + }, + { + "epoch": 1.5870584557674272, + "grad_norm": 0.2890625, + "learning_rate": 0.00013116797303605387, + "loss": 1.3883, + "step": 5113 + }, + { + "epoch": 1.5873710534542045, + "grad_norm": 0.27734375, + "learning_rate": 0.00013114462545712715, + "loss": 1.7757, + "step": 5114 + }, + { + "epoch": 1.5876836511409815, + "grad_norm": 0.2890625, + "learning_rate": 0.0001311212759979597, + "loss": 1.6715, + "step": 5115 + }, + { + "epoch": 1.5879962488277588, + "grad_norm": 0.2890625, + "learning_rate": 0.00013109792465996117, + "loss": 1.423, + "step": 5116 + }, + { + "epoch": 1.5883088465145359, + "grad_norm": 0.283203125, + "learning_rate": 0.00013107457144454128, + "loss": 1.6068, + "step": 5117 + }, + { + "epoch": 1.588621444201313, + "grad_norm": 0.318359375, + "learning_rate": 0.00013105121635310996, + "loss": 2.3311, + "step": 5118 + }, + { + "epoch": 1.58893404188809, + "grad_norm": 0.275390625, + "learning_rate": 0.00013102785938707708, + "loss": 1.6465, + "step": 5119 + }, + { + "epoch": 1.589246639574867, + "grad_norm": 0.28125, + "learning_rate": 0.00013100450054785284, + "loss": 1.6986, + "step": 5120 + }, + { + "epoch": 1.5895592372616443, + "grad_norm": 0.28515625, + "learning_rate": 0.00013098113983684735, + "loss": 1.3467, + "step": 5121 + }, + { + "epoch": 1.5898718349484215, + "grad_norm": 0.279296875, + "learning_rate": 0.000130957777255471, + "loss": 1.6384, + "step": 5122 + }, + { + "epoch": 1.5901844326351986, + "grad_norm": 0.27734375, + "learning_rate": 0.00013093441280513415, + "loss": 1.4282, + "step": 5123 + }, + { + "epoch": 1.5904970303219756, + "grad_norm": 0.283203125, + "learning_rate": 0.00013091104648724745, + "loss": 1.5334, + "step": 5124 + }, + { + "epoch": 1.5908096280087527, + "grad_norm": 0.279296875, + "learning_rate": 0.00013088767830322145, + "loss": 1.5452, + "step": 5125 + }, + { + "epoch": 1.5911222256955297, + "grad_norm": 0.283203125, + "learning_rate": 0.00013086430825446694, + "loss": 1.4513, + "step": 5126 + }, + { + "epoch": 1.591434823382307, + "grad_norm": 0.259765625, + "learning_rate": 0.0001308409363423948, + "loss": 1.5732, + "step": 5127 + }, + { + "epoch": 1.591747421069084, + "grad_norm": 0.275390625, + "learning_rate": 0.00013081756256841604, + "loss": 1.9518, + "step": 5128 + }, + { + "epoch": 1.5920600187558613, + "grad_norm": 0.267578125, + "learning_rate": 0.00013079418693394174, + "loss": 1.9115, + "step": 5129 + }, + { + "epoch": 
1.5923726164426384, + "grad_norm": 0.275390625, + "learning_rate": 0.00013077080944038318, + "loss": 2.0819, + "step": 5130 + }, + { + "epoch": 1.5926852141294154, + "grad_norm": 0.291015625, + "learning_rate": 0.0001307474300891516, + "loss": 1.6396, + "step": 5131 + }, + { + "epoch": 1.5929978118161925, + "grad_norm": 0.26953125, + "learning_rate": 0.00013072404888165852, + "loss": 1.4752, + "step": 5132 + }, + { + "epoch": 1.5933104095029695, + "grad_norm": 0.271484375, + "learning_rate": 0.0001307006658193154, + "loss": 1.5275, + "step": 5133 + }, + { + "epoch": 1.5936230071897468, + "grad_norm": 0.291015625, + "learning_rate": 0.00013067728090353402, + "loss": 1.7718, + "step": 5134 + }, + { + "epoch": 1.5939356048765239, + "grad_norm": 0.2734375, + "learning_rate": 0.00013065389413572607, + "loss": 1.3918, + "step": 5135 + }, + { + "epoch": 1.5942482025633011, + "grad_norm": 0.27734375, + "learning_rate": 0.00013063050551730351, + "loss": 1.3909, + "step": 5136 + }, + { + "epoch": 1.5945608002500782, + "grad_norm": 0.28125, + "learning_rate": 0.00013060711504967823, + "loss": 1.6993, + "step": 5137 + }, + { + "epoch": 1.5948733979368552, + "grad_norm": 0.283203125, + "learning_rate": 0.00013058372273426247, + "loss": 1.6457, + "step": 5138 + }, + { + "epoch": 1.5951859956236323, + "grad_norm": 0.291015625, + "learning_rate": 0.00013056032857246836, + "loss": 1.4885, + "step": 5139 + }, + { + "epoch": 1.5954985933104096, + "grad_norm": 0.2734375, + "learning_rate": 0.0001305369325657083, + "loss": 1.5207, + "step": 5140 + }, + { + "epoch": 1.5958111909971866, + "grad_norm": 0.275390625, + "learning_rate": 0.00013051353471539465, + "loss": 1.6183, + "step": 5141 + }, + { + "epoch": 1.5961237886839639, + "grad_norm": 0.291015625, + "learning_rate": 0.0001304901350229401, + "loss": 1.3675, + "step": 5142 + }, + { + "epoch": 1.596436386370741, + "grad_norm": 0.2890625, + "learning_rate": 0.0001304667334897572, + "loss": 1.4596, + "step": 5143 + }, + { + "epoch": 1.596748984057518, + "grad_norm": 0.298828125, + "learning_rate": 0.00013044333011725878, + "loss": 1.5626, + "step": 5144 + }, + { + "epoch": 1.597061581744295, + "grad_norm": 0.28515625, + "learning_rate": 0.00013041992490685773, + "loss": 1.4892, + "step": 5145 + }, + { + "epoch": 1.597374179431072, + "grad_norm": 0.28125, + "learning_rate": 0.00013039651785996706, + "loss": 1.3213, + "step": 5146 + }, + { + "epoch": 1.5976867771178493, + "grad_norm": 0.283203125, + "learning_rate": 0.00013037310897799986, + "loss": 1.3564, + "step": 5147 + }, + { + "epoch": 1.5979993748046264, + "grad_norm": 0.30859375, + "learning_rate": 0.00013034969826236937, + "loss": 1.6341, + "step": 5148 + }, + { + "epoch": 1.5983119724914037, + "grad_norm": 0.279296875, + "learning_rate": 0.0001303262857144889, + "loss": 1.611, + "step": 5149 + }, + { + "epoch": 1.5986245701781807, + "grad_norm": 0.2734375, + "learning_rate": 0.00013030287133577195, + "loss": 1.4833, + "step": 5150 + }, + { + "epoch": 1.5989371678649578, + "grad_norm": 0.291015625, + "learning_rate": 0.00013027945512763202, + "loss": 1.4268, + "step": 5151 + }, + { + "epoch": 1.5992497655517348, + "grad_norm": 0.287109375, + "learning_rate": 0.0001302560370914828, + "loss": 1.5177, + "step": 5152 + }, + { + "epoch": 1.599562363238512, + "grad_norm": 0.28125, + "learning_rate": 0.00013023261722873807, + "loss": 1.6612, + "step": 5153 + }, + { + "epoch": 1.5998749609252891, + "grad_norm": 0.28515625, + "learning_rate": 0.00013020919554081173, + "loss": 1.698, + "step": 5154 + }, + { + 
"epoch": 1.6001875586120664, + "grad_norm": 0.275390625, + "learning_rate": 0.00013018577202911775, + "loss": 1.5155, + "step": 5155 + }, + { + "epoch": 1.6005001562988435, + "grad_norm": 0.30078125, + "learning_rate": 0.00013016234669507024, + "loss": 1.5116, + "step": 5156 + }, + { + "epoch": 1.6008127539856205, + "grad_norm": 0.2890625, + "learning_rate": 0.00013013891954008342, + "loss": 1.55, + "step": 5157 + }, + { + "epoch": 1.6011253516723976, + "grad_norm": 0.287109375, + "learning_rate": 0.00013011549056557163, + "loss": 1.5349, + "step": 5158 + }, + { + "epoch": 1.6014379493591746, + "grad_norm": 0.28515625, + "learning_rate": 0.00013009205977294926, + "loss": 1.561, + "step": 5159 + }, + { + "epoch": 1.6017505470459519, + "grad_norm": 0.2890625, + "learning_rate": 0.00013006862716363098, + "loss": 1.6057, + "step": 5160 + }, + { + "epoch": 1.602063144732729, + "grad_norm": 0.275390625, + "learning_rate": 0.0001300451927390313, + "loss": 1.3553, + "step": 5161 + }, + { + "epoch": 1.6023757424195062, + "grad_norm": 0.271484375, + "learning_rate": 0.00013002175650056504, + "loss": 1.7408, + "step": 5162 + }, + { + "epoch": 1.6026883401062832, + "grad_norm": 0.283203125, + "learning_rate": 0.0001299983184496471, + "loss": 1.5022, + "step": 5163 + }, + { + "epoch": 1.6030009377930603, + "grad_norm": 0.27734375, + "learning_rate": 0.00012997487858769244, + "loss": 1.6159, + "step": 5164 + }, + { + "epoch": 1.6033135354798373, + "grad_norm": 0.2890625, + "learning_rate": 0.00012995143691611616, + "loss": 1.3795, + "step": 5165 + }, + { + "epoch": 1.6036261331666146, + "grad_norm": 0.27734375, + "learning_rate": 0.0001299279934363335, + "loss": 1.6353, + "step": 5166 + }, + { + "epoch": 1.6039387308533917, + "grad_norm": 0.2734375, + "learning_rate": 0.0001299045481497597, + "loss": 1.6614, + "step": 5167 + }, + { + "epoch": 1.604251328540169, + "grad_norm": 0.29296875, + "learning_rate": 0.00012988110105781024, + "loss": 1.5381, + "step": 5168 + }, + { + "epoch": 1.604563926226946, + "grad_norm": 0.28515625, + "learning_rate": 0.0001298576521619006, + "loss": 1.5451, + "step": 5169 + }, + { + "epoch": 1.604876523913723, + "grad_norm": 0.298828125, + "learning_rate": 0.0001298342014634465, + "loss": 1.6875, + "step": 5170 + }, + { + "epoch": 1.6051891216005, + "grad_norm": 0.28515625, + "learning_rate": 0.00012981074896386362, + "loss": 1.6455, + "step": 5171 + }, + { + "epoch": 1.6055017192872771, + "grad_norm": 0.279296875, + "learning_rate": 0.00012978729466456783, + "loss": 1.6731, + "step": 5172 + }, + { + "epoch": 1.6058143169740544, + "grad_norm": 0.271484375, + "learning_rate": 0.0001297638385669751, + "loss": 1.2996, + "step": 5173 + }, + { + "epoch": 1.6061269146608315, + "grad_norm": 0.302734375, + "learning_rate": 0.0001297403806725015, + "loss": 1.5165, + "step": 5174 + }, + { + "epoch": 1.6064395123476087, + "grad_norm": 0.275390625, + "learning_rate": 0.00012971692098256323, + "loss": 1.585, + "step": 5175 + }, + { + "epoch": 1.6067521100343858, + "grad_norm": 0.302734375, + "learning_rate": 0.0001296934594985766, + "loss": 1.3452, + "step": 5176 + }, + { + "epoch": 1.6070647077211628, + "grad_norm": 0.27734375, + "learning_rate": 0.00012966999622195794, + "loss": 1.5152, + "step": 5177 + }, + { + "epoch": 1.6073773054079399, + "grad_norm": 0.2890625, + "learning_rate": 0.00012964653115412383, + "loss": 1.3423, + "step": 5178 + }, + { + "epoch": 1.6076899030947172, + "grad_norm": 0.28125, + "learning_rate": 0.00012962306429649084, + "loss": 1.5529, + "step": 5179 + }, + 
{ + "epoch": 1.6080025007814942, + "grad_norm": 0.271484375, + "learning_rate": 0.0001295995956504757, + "loss": 1.6913, + "step": 5180 + }, + { + "epoch": 1.6083150984682715, + "grad_norm": 0.26953125, + "learning_rate": 0.00012957612521749523, + "loss": 1.7764, + "step": 5181 + }, + { + "epoch": 1.6086276961550485, + "grad_norm": 0.283203125, + "learning_rate": 0.00012955265299896646, + "loss": 1.4887, + "step": 5182 + }, + { + "epoch": 1.6089402938418256, + "grad_norm": 0.27734375, + "learning_rate": 0.00012952917899630633, + "loss": 1.4849, + "step": 5183 + }, + { + "epoch": 1.6092528915286026, + "grad_norm": 0.275390625, + "learning_rate": 0.00012950570321093206, + "loss": 1.6065, + "step": 5184 + }, + { + "epoch": 1.6095654892153797, + "grad_norm": 0.287109375, + "learning_rate": 0.00012948222564426084, + "loss": 1.5114, + "step": 5185 + }, + { + "epoch": 1.609878086902157, + "grad_norm": 0.283203125, + "learning_rate": 0.00012945874629771012, + "loss": 1.3768, + "step": 5186 + }, + { + "epoch": 1.610190684588934, + "grad_norm": 0.279296875, + "learning_rate": 0.00012943526517269734, + "loss": 1.3997, + "step": 5187 + }, + { + "epoch": 1.6105032822757113, + "grad_norm": 0.27734375, + "learning_rate": 0.00012941178227064007, + "loss": 1.6198, + "step": 5188 + }, + { + "epoch": 1.6108158799624883, + "grad_norm": 0.271484375, + "learning_rate": 0.00012938829759295606, + "loss": 1.3631, + "step": 5189 + }, + { + "epoch": 1.6111284776492654, + "grad_norm": 0.275390625, + "learning_rate": 0.00012936481114106307, + "loss": 1.4819, + "step": 5190 + }, + { + "epoch": 1.6114410753360424, + "grad_norm": 0.287109375, + "learning_rate": 0.000129341322916379, + "loss": 1.4377, + "step": 5191 + }, + { + "epoch": 1.6117536730228195, + "grad_norm": 0.271484375, + "learning_rate": 0.00012931783292032187, + "loss": 1.3551, + "step": 5192 + }, + { + "epoch": 1.6120662707095967, + "grad_norm": 0.283203125, + "learning_rate": 0.0001292943411543098, + "loss": 1.5592, + "step": 5193 + }, + { + "epoch": 1.612378868396374, + "grad_norm": 0.26953125, + "learning_rate": 0.00012927084761976104, + "loss": 1.6148, + "step": 5194 + }, + { + "epoch": 1.612691466083151, + "grad_norm": 0.26953125, + "learning_rate": 0.0001292473523180939, + "loss": 1.5892, + "step": 5195 + }, + { + "epoch": 1.613004063769928, + "grad_norm": 0.283203125, + "learning_rate": 0.00012922385525072685, + "loss": 1.5989, + "step": 5196 + }, + { + "epoch": 1.6133166614567052, + "grad_norm": 0.291015625, + "learning_rate": 0.00012920035641907838, + "loss": 1.7232, + "step": 5197 + }, + { + "epoch": 1.6136292591434822, + "grad_norm": 0.275390625, + "learning_rate": 0.00012917685582456722, + "loss": 1.6316, + "step": 5198 + }, + { + "epoch": 1.6139418568302595, + "grad_norm": 0.271484375, + "learning_rate": 0.0001291533534686121, + "loss": 1.3499, + "step": 5199 + }, + { + "epoch": 1.6142544545170365, + "grad_norm": 0.26171875, + "learning_rate": 0.00012912984935263184, + "loss": 1.6058, + "step": 5200 + }, + { + "epoch": 1.6145670522038138, + "grad_norm": 0.294921875, + "learning_rate": 0.0001291063434780455, + "loss": 1.2795, + "step": 5201 + }, + { + "epoch": 1.6148796498905909, + "grad_norm": 0.294921875, + "learning_rate": 0.0001290828358462721, + "loss": 1.438, + "step": 5202 + }, + { + "epoch": 1.615192247577368, + "grad_norm": 0.275390625, + "learning_rate": 0.0001290593264587308, + "loss": 1.5131, + "step": 5203 + }, + { + "epoch": 1.615504845264145, + "grad_norm": 0.3203125, + "learning_rate": 0.00012903581531684098, + "loss": 2.1409, + 
"step": 5204 + }, + { + "epoch": 1.615817442950922, + "grad_norm": 0.263671875, + "learning_rate": 0.00012901230242202193, + "loss": 1.5455, + "step": 5205 + }, + { + "epoch": 1.6161300406376993, + "grad_norm": 0.283203125, + "learning_rate": 0.00012898878777569328, + "loss": 1.4208, + "step": 5206 + }, + { + "epoch": 1.6164426383244765, + "grad_norm": 0.2890625, + "learning_rate": 0.00012896527137927453, + "loss": 1.6268, + "step": 5207 + }, + { + "epoch": 1.6167552360112536, + "grad_norm": 0.2890625, + "learning_rate": 0.00012894175323418546, + "loss": 1.524, + "step": 5208 + }, + { + "epoch": 1.6170678336980306, + "grad_norm": 0.310546875, + "learning_rate": 0.00012891823334184585, + "loss": 1.6437, + "step": 5209 + }, + { + "epoch": 1.6173804313848077, + "grad_norm": 0.2734375, + "learning_rate": 0.00012889471170367565, + "loss": 1.4686, + "step": 5210 + }, + { + "epoch": 1.6176930290715847, + "grad_norm": 0.275390625, + "learning_rate": 0.00012887118832109485, + "loss": 1.5133, + "step": 5211 + }, + { + "epoch": 1.618005626758362, + "grad_norm": 0.283203125, + "learning_rate": 0.00012884766319552367, + "loss": 1.5129, + "step": 5212 + }, + { + "epoch": 1.618318224445139, + "grad_norm": 0.283203125, + "learning_rate": 0.00012882413632838227, + "loss": 1.5047, + "step": 5213 + }, + { + "epoch": 1.6186308221319163, + "grad_norm": 0.34765625, + "learning_rate": 0.00012880060772109105, + "loss": 2.2321, + "step": 5214 + }, + { + "epoch": 1.6189434198186934, + "grad_norm": 0.283203125, + "learning_rate": 0.00012877707737507044, + "loss": 1.5715, + "step": 5215 + }, + { + "epoch": 1.6192560175054704, + "grad_norm": 0.275390625, + "learning_rate": 0.000128753545291741, + "loss": 1.6164, + "step": 5216 + }, + { + "epoch": 1.6195686151922475, + "grad_norm": 0.294921875, + "learning_rate": 0.00012873001147252334, + "loss": 1.6011, + "step": 5217 + }, + { + "epoch": 1.6198812128790245, + "grad_norm": 0.275390625, + "learning_rate": 0.00012870647591883833, + "loss": 1.4171, + "step": 5218 + }, + { + "epoch": 1.6201938105658018, + "grad_norm": 0.283203125, + "learning_rate": 0.0001286829386321068, + "loss": 1.6437, + "step": 5219 + }, + { + "epoch": 1.620506408252579, + "grad_norm": 0.279296875, + "learning_rate": 0.00012865939961374969, + "loss": 1.6432, + "step": 5220 + }, + { + "epoch": 1.6208190059393561, + "grad_norm": 0.28515625, + "learning_rate": 0.00012863585886518808, + "loss": 1.8146, + "step": 5221 + }, + { + "epoch": 1.6211316036261332, + "grad_norm": 0.279296875, + "learning_rate": 0.0001286123163878432, + "loss": 1.6181, + "step": 5222 + }, + { + "epoch": 1.6214442013129102, + "grad_norm": 0.2734375, + "learning_rate": 0.0001285887721831363, + "loss": 1.7212, + "step": 5223 + }, + { + "epoch": 1.6217567989996873, + "grad_norm": 0.291015625, + "learning_rate": 0.00012856522625248883, + "loss": 1.4609, + "step": 5224 + }, + { + "epoch": 1.6220693966864645, + "grad_norm": 0.2734375, + "learning_rate": 0.00012854167859732222, + "loss": 1.6521, + "step": 5225 + }, + { + "epoch": 1.6223819943732416, + "grad_norm": 0.27734375, + "learning_rate": 0.00012851812921905813, + "loss": 1.6265, + "step": 5226 + }, + { + "epoch": 1.6226945920600189, + "grad_norm": 0.283203125, + "learning_rate": 0.0001284945781191182, + "loss": 1.3805, + "step": 5227 + }, + { + "epoch": 1.623007189746796, + "grad_norm": 0.2734375, + "learning_rate": 0.00012847102529892432, + "loss": 1.5773, + "step": 5228 + }, + { + "epoch": 1.623319787433573, + "grad_norm": 0.27734375, + "learning_rate": 0.00012844747075989833, + 
"loss": 1.3183, + "step": 5229 + }, + { + "epoch": 1.62363238512035, + "grad_norm": 0.2734375, + "learning_rate": 0.00012842391450346228, + "loss": 1.6123, + "step": 5230 + }, + { + "epoch": 1.623944982807127, + "grad_norm": 0.27734375, + "learning_rate": 0.0001284003565310383, + "loss": 1.763, + "step": 5231 + }, + { + "epoch": 1.6242575804939043, + "grad_norm": 0.3046875, + "learning_rate": 0.00012837679684404862, + "loss": 1.5674, + "step": 5232 + }, + { + "epoch": 1.6245701781806816, + "grad_norm": 0.271484375, + "learning_rate": 0.00012835323544391553, + "loss": 1.6564, + "step": 5233 + }, + { + "epoch": 1.6248827758674587, + "grad_norm": 0.271484375, + "learning_rate": 0.0001283296723320615, + "loss": 1.7379, + "step": 5234 + }, + { + "epoch": 1.6251953735542357, + "grad_norm": 0.2890625, + "learning_rate": 0.00012830610750990906, + "loss": 1.4359, + "step": 5235 + }, + { + "epoch": 1.6255079712410128, + "grad_norm": 0.283203125, + "learning_rate": 0.00012828254097888082, + "loss": 1.461, + "step": 5236 + }, + { + "epoch": 1.6258205689277898, + "grad_norm": 0.2890625, + "learning_rate": 0.00012825897274039956, + "loss": 1.5576, + "step": 5237 + }, + { + "epoch": 1.626133166614567, + "grad_norm": 0.271484375, + "learning_rate": 0.00012823540279588807, + "loss": 1.9163, + "step": 5238 + }, + { + "epoch": 1.6264457643013441, + "grad_norm": 0.265625, + "learning_rate": 0.00012821183114676937, + "loss": 1.4001, + "step": 5239 + }, + { + "epoch": 1.6267583619881214, + "grad_norm": 0.294921875, + "learning_rate": 0.00012818825779446644, + "loss": 1.5625, + "step": 5240 + }, + { + "epoch": 1.6270709596748985, + "grad_norm": 0.298828125, + "learning_rate": 0.00012816468274040246, + "loss": 1.6887, + "step": 5241 + }, + { + "epoch": 1.6273835573616755, + "grad_norm": 0.28125, + "learning_rate": 0.00012814110598600073, + "loss": 1.3923, + "step": 5242 + }, + { + "epoch": 1.6276961550484526, + "grad_norm": 0.294921875, + "learning_rate": 0.00012811752753268455, + "loss": 1.4533, + "step": 5243 + }, + { + "epoch": 1.6280087527352296, + "grad_norm": 0.279296875, + "learning_rate": 0.00012809394738187742, + "loss": 1.2255, + "step": 5244 + }, + { + "epoch": 1.6283213504220069, + "grad_norm": 0.27734375, + "learning_rate": 0.00012807036553500286, + "loss": 1.4549, + "step": 5245 + }, + { + "epoch": 1.6286339481087841, + "grad_norm": 0.287109375, + "learning_rate": 0.00012804678199348457, + "loss": 1.4125, + "step": 5246 + }, + { + "epoch": 1.6289465457955612, + "grad_norm": 0.267578125, + "learning_rate": 0.00012802319675874632, + "loss": 1.7342, + "step": 5247 + }, + { + "epoch": 1.6292591434823382, + "grad_norm": 0.2734375, + "learning_rate": 0.00012799960983221197, + "loss": 1.3691, + "step": 5248 + }, + { + "epoch": 1.6295717411691153, + "grad_norm": 0.279296875, + "learning_rate": 0.0001279760212153055, + "loss": 1.5872, + "step": 5249 + }, + { + "epoch": 1.6298843388558923, + "grad_norm": 0.294921875, + "learning_rate": 0.00012795243090945094, + "loss": 1.7517, + "step": 5250 + }, + { + "epoch": 1.6301969365426696, + "grad_norm": 0.3125, + "learning_rate": 0.00012792883891607257, + "loss": 1.5518, + "step": 5251 + }, + { + "epoch": 1.6305095342294467, + "grad_norm": 0.263671875, + "learning_rate": 0.00012790524523659458, + "loss": 1.6365, + "step": 5252 + }, + { + "epoch": 1.630822131916224, + "grad_norm": 0.283203125, + "learning_rate": 0.00012788164987244133, + "loss": 1.6613, + "step": 5253 + }, + { + "epoch": 1.631134729603001, + "grad_norm": 0.29296875, + "learning_rate": 
0.0001278580528250374, + "loss": 1.6701, + "step": 5254 + }, + { + "epoch": 1.631447327289778, + "grad_norm": 0.326171875, + "learning_rate": 0.00012783445409580733, + "loss": 2.2097, + "step": 5255 + }, + { + "epoch": 1.631759924976555, + "grad_norm": 0.28125, + "learning_rate": 0.00012781085368617574, + "loss": 1.5379, + "step": 5256 + }, + { + "epoch": 1.6320725226633321, + "grad_norm": 0.287109375, + "learning_rate": 0.00012778725159756752, + "loss": 1.5629, + "step": 5257 + }, + { + "epoch": 1.6323851203501094, + "grad_norm": 0.283203125, + "learning_rate": 0.0001277636478314075, + "loss": 1.5427, + "step": 5258 + }, + { + "epoch": 1.6326977180368867, + "grad_norm": 0.2734375, + "learning_rate": 0.00012774004238912066, + "loss": 1.4046, + "step": 5259 + }, + { + "epoch": 1.6330103157236637, + "grad_norm": 0.275390625, + "learning_rate": 0.00012771643527213213, + "loss": 1.405, + "step": 5260 + }, + { + "epoch": 1.6333229134104408, + "grad_norm": 0.279296875, + "learning_rate": 0.0001276928264818671, + "loss": 1.6566, + "step": 5261 + }, + { + "epoch": 1.6336355110972178, + "grad_norm": 0.283203125, + "learning_rate": 0.00012766921601975082, + "loss": 1.3281, + "step": 5262 + }, + { + "epoch": 1.6339481087839949, + "grad_norm": 0.28125, + "learning_rate": 0.00012764560388720873, + "loss": 1.5187, + "step": 5263 + }, + { + "epoch": 1.6342607064707722, + "grad_norm": 0.287109375, + "learning_rate": 0.00012762199008566627, + "loss": 1.47, + "step": 5264 + }, + { + "epoch": 1.6345733041575492, + "grad_norm": 0.283203125, + "learning_rate": 0.0001275983746165491, + "loss": 1.565, + "step": 5265 + }, + { + "epoch": 1.6348859018443265, + "grad_norm": 0.2734375, + "learning_rate": 0.00012757475748128287, + "loss": 1.6159, + "step": 5266 + }, + { + "epoch": 1.6351984995311035, + "grad_norm": 0.291015625, + "learning_rate": 0.0001275511386812934, + "loss": 1.446, + "step": 5267 + }, + { + "epoch": 1.6355110972178806, + "grad_norm": 0.27734375, + "learning_rate": 0.00012752751821800657, + "loss": 1.4458, + "step": 5268 + }, + { + "epoch": 1.6358236949046576, + "grad_norm": 0.26953125, + "learning_rate": 0.0001275038960928484, + "loss": 1.6677, + "step": 5269 + }, + { + "epoch": 1.6361362925914347, + "grad_norm": 0.27734375, + "learning_rate": 0.00012748027230724497, + "loss": 1.7076, + "step": 5270 + }, + { + "epoch": 1.636448890278212, + "grad_norm": 0.294921875, + "learning_rate": 0.0001274566468626225, + "loss": 1.4324, + "step": 5271 + }, + { + "epoch": 1.6367614879649892, + "grad_norm": 0.28125, + "learning_rate": 0.00012743301976040722, + "loss": 1.3423, + "step": 5272 + }, + { + "epoch": 1.6370740856517663, + "grad_norm": 0.275390625, + "learning_rate": 0.00012740939100202564, + "loss": 1.5177, + "step": 5273 + }, + { + "epoch": 1.6373866833385433, + "grad_norm": 0.263671875, + "learning_rate": 0.00012738576058890413, + "loss": 1.4188, + "step": 5274 + }, + { + "epoch": 1.6376992810253204, + "grad_norm": 0.271484375, + "learning_rate": 0.0001273621285224694, + "loss": 1.4759, + "step": 5275 + }, + { + "epoch": 1.6380118787120974, + "grad_norm": 0.279296875, + "learning_rate": 0.00012733849480414807, + "loss": 1.5118, + "step": 5276 + }, + { + "epoch": 1.6383244763988747, + "grad_norm": 0.294921875, + "learning_rate": 0.00012731485943536704, + "loss": 1.3826, + "step": 5277 + }, + { + "epoch": 1.6386370740856517, + "grad_norm": 0.294921875, + "learning_rate": 0.0001272912224175531, + "loss": 1.417, + "step": 5278 + }, + { + "epoch": 1.638949671772429, + "grad_norm": 0.27734375, + 
"learning_rate": 0.00012726758375213327, + "loss": 1.4968, + "step": 5279 + }, + { + "epoch": 1.639262269459206, + "grad_norm": 0.28125, + "learning_rate": 0.00012724394344053465, + "loss": 1.4183, + "step": 5280 + }, + { + "epoch": 1.639574867145983, + "grad_norm": 0.279296875, + "learning_rate": 0.00012722030148418448, + "loss": 1.573, + "step": 5281 + }, + { + "epoch": 1.6398874648327602, + "grad_norm": 0.267578125, + "learning_rate": 0.00012719665788451, + "loss": 1.5447, + "step": 5282 + }, + { + "epoch": 1.6402000625195372, + "grad_norm": 0.28125, + "learning_rate": 0.00012717301264293865, + "loss": 1.6017, + "step": 5283 + }, + { + "epoch": 1.6405126602063145, + "grad_norm": 0.29296875, + "learning_rate": 0.0001271493657608979, + "loss": 1.5853, + "step": 5284 + }, + { + "epoch": 1.6408252578930917, + "grad_norm": 0.2890625, + "learning_rate": 0.00012712571723981532, + "loss": 1.737, + "step": 5285 + }, + { + "epoch": 1.6411378555798688, + "grad_norm": 0.29296875, + "learning_rate": 0.00012710206708111863, + "loss": 1.4768, + "step": 5286 + }, + { + "epoch": 1.6414504532666458, + "grad_norm": 0.2734375, + "learning_rate": 0.0001270784152862356, + "loss": 1.5415, + "step": 5287 + }, + { + "epoch": 1.641763050953423, + "grad_norm": 0.28515625, + "learning_rate": 0.00012705476185659412, + "loss": 1.4512, + "step": 5288 + }, + { + "epoch": 1.6420756486402, + "grad_norm": 0.294921875, + "learning_rate": 0.00012703110679362226, + "loss": 1.4359, + "step": 5289 + }, + { + "epoch": 1.6423882463269772, + "grad_norm": 0.2890625, + "learning_rate": 0.00012700745009874799, + "loss": 2.0473, + "step": 5290 + }, + { + "epoch": 1.6427008440137543, + "grad_norm": 0.27734375, + "learning_rate": 0.00012698379177339956, + "loss": 1.7714, + "step": 5291 + }, + { + "epoch": 1.6430134417005315, + "grad_norm": 0.380859375, + "learning_rate": 0.00012696013181900522, + "loss": 2.3856, + "step": 5292 + }, + { + "epoch": 1.6433260393873086, + "grad_norm": 0.29296875, + "learning_rate": 0.00012693647023699335, + "loss": 1.3935, + "step": 5293 + }, + { + "epoch": 1.6436386370740856, + "grad_norm": 0.267578125, + "learning_rate": 0.00012691280702879247, + "loss": 1.5267, + "step": 5294 + }, + { + "epoch": 1.6439512347608627, + "grad_norm": 0.28125, + "learning_rate": 0.00012688914219583116, + "loss": 1.7312, + "step": 5295 + }, + { + "epoch": 1.6442638324476397, + "grad_norm": 0.283203125, + "learning_rate": 0.00012686547573953803, + "loss": 1.5775, + "step": 5296 + }, + { + "epoch": 1.644576430134417, + "grad_norm": 0.37890625, + "learning_rate": 0.0001268418076613419, + "loss": 1.9265, + "step": 5297 + }, + { + "epoch": 1.6448890278211943, + "grad_norm": 0.287109375, + "learning_rate": 0.00012681813796267162, + "loss": 1.5896, + "step": 5298 + }, + { + "epoch": 1.6452016255079713, + "grad_norm": 0.26953125, + "learning_rate": 0.00012679446664495622, + "loss": 1.5002, + "step": 5299 + }, + { + "epoch": 1.6455142231947484, + "grad_norm": 0.2890625, + "learning_rate": 0.00012677079370962467, + "loss": 1.3045, + "step": 5300 + }, + { + "epoch": 1.6458268208815254, + "grad_norm": 0.291015625, + "learning_rate": 0.00012674711915810626, + "loss": 1.4689, + "step": 5301 + }, + { + "epoch": 1.6461394185683025, + "grad_norm": 0.2890625, + "learning_rate": 0.00012672344299183012, + "loss": 1.4018, + "step": 5302 + }, + { + "epoch": 1.6464520162550798, + "grad_norm": 0.283203125, + "learning_rate": 0.0001266997652122257, + "loss": 1.5373, + "step": 5303 + }, + { + "epoch": 1.6467646139418568, + "grad_norm": 0.271484375, + 
"learning_rate": 0.0001266760858207224, + "loss": 1.5459, + "step": 5304 + }, + { + "epoch": 1.647077211628634, + "grad_norm": 0.275390625, + "learning_rate": 0.00012665240481874986, + "loss": 1.7597, + "step": 5305 + }, + { + "epoch": 1.6473898093154111, + "grad_norm": 0.28515625, + "learning_rate": 0.00012662872220773762, + "loss": 1.628, + "step": 5306 + }, + { + "epoch": 1.6477024070021882, + "grad_norm": 0.27734375, + "learning_rate": 0.00012660503798911555, + "loss": 1.5585, + "step": 5307 + }, + { + "epoch": 1.6480150046889652, + "grad_norm": 0.279296875, + "learning_rate": 0.0001265813521643134, + "loss": 1.5892, + "step": 5308 + }, + { + "epoch": 1.6483276023757423, + "grad_norm": 0.294921875, + "learning_rate": 0.00012655766473476115, + "loss": 1.54, + "step": 5309 + }, + { + "epoch": 1.6486402000625195, + "grad_norm": 0.287109375, + "learning_rate": 0.00012653397570188882, + "loss": 1.9819, + "step": 5310 + }, + { + "epoch": 1.6489527977492968, + "grad_norm": 0.275390625, + "learning_rate": 0.0001265102850671266, + "loss": 1.4561, + "step": 5311 + }, + { + "epoch": 1.6492653954360739, + "grad_norm": 0.2734375, + "learning_rate": 0.00012648659283190464, + "loss": 1.5799, + "step": 5312 + }, + { + "epoch": 1.649577993122851, + "grad_norm": 0.294921875, + "learning_rate": 0.00012646289899765338, + "loss": 1.3482, + "step": 5313 + }, + { + "epoch": 1.649890590809628, + "grad_norm": 0.291015625, + "learning_rate": 0.00012643920356580313, + "loss": 1.8221, + "step": 5314 + }, + { + "epoch": 1.650203188496405, + "grad_norm": 0.30078125, + "learning_rate": 0.00012641550653778448, + "loss": 1.4486, + "step": 5315 + }, + { + "epoch": 1.6505157861831823, + "grad_norm": 0.306640625, + "learning_rate": 0.00012639180791502804, + "loss": 1.445, + "step": 5316 + }, + { + "epoch": 1.6508283838699593, + "grad_norm": 0.283203125, + "learning_rate": 0.00012636810769896454, + "loss": 1.48, + "step": 5317 + }, + { + "epoch": 1.6511409815567366, + "grad_norm": 0.283203125, + "learning_rate": 0.00012634440589102478, + "loss": 1.4193, + "step": 5318 + }, + { + "epoch": 1.6514535792435137, + "grad_norm": 0.296875, + "learning_rate": 0.00012632070249263969, + "loss": 1.7591, + "step": 5319 + }, + { + "epoch": 1.6517661769302907, + "grad_norm": 0.287109375, + "learning_rate": 0.0001262969975052402, + "loss": 1.5557, + "step": 5320 + }, + { + "epoch": 1.6520787746170678, + "grad_norm": 0.27734375, + "learning_rate": 0.00012627329093025747, + "loss": 1.6633, + "step": 5321 + }, + { + "epoch": 1.6523913723038448, + "grad_norm": 0.275390625, + "learning_rate": 0.00012624958276912266, + "loss": 1.6311, + "step": 5322 + }, + { + "epoch": 1.652703969990622, + "grad_norm": 0.2890625, + "learning_rate": 0.00012622587302326714, + "loss": 1.6457, + "step": 5323 + }, + { + "epoch": 1.6530165676773994, + "grad_norm": 0.2734375, + "learning_rate": 0.0001262021616941222, + "loss": 1.5174, + "step": 5324 + }, + { + "epoch": 1.6533291653641764, + "grad_norm": 0.271484375, + "learning_rate": 0.00012617844878311943, + "loss": 1.922, + "step": 5325 + }, + { + "epoch": 1.6536417630509535, + "grad_norm": 0.28125, + "learning_rate": 0.0001261547342916903, + "loss": 1.6622, + "step": 5326 + }, + { + "epoch": 1.6539543607377305, + "grad_norm": 0.279296875, + "learning_rate": 0.00012613101822126654, + "loss": 1.6665, + "step": 5327 + }, + { + "epoch": 1.6542669584245075, + "grad_norm": 0.28125, + "learning_rate": 0.00012610730057327992, + "loss": 1.781, + "step": 5328 + }, + { + "epoch": 1.6545795561112848, + "grad_norm": 
0.287109375, + "learning_rate": 0.00012608358134916228, + "loss": 1.7471, + "step": 5329 + }, + { + "epoch": 1.6548921537980619, + "grad_norm": 0.28515625, + "learning_rate": 0.00012605986055034562, + "loss": 1.6828, + "step": 5330 + }, + { + "epoch": 1.6552047514848391, + "grad_norm": 0.275390625, + "learning_rate": 0.00012603613817826193, + "loss": 1.7583, + "step": 5331 + }, + { + "epoch": 1.6555173491716162, + "grad_norm": 0.294921875, + "learning_rate": 0.0001260124142343434, + "loss": 1.5421, + "step": 5332 + }, + { + "epoch": 1.6558299468583932, + "grad_norm": 0.296875, + "learning_rate": 0.00012598868872002234, + "loss": 1.4766, + "step": 5333 + }, + { + "epoch": 1.6561425445451703, + "grad_norm": 0.287109375, + "learning_rate": 0.00012596496163673097, + "loss": 1.6682, + "step": 5334 + }, + { + "epoch": 1.6564551422319473, + "grad_norm": 0.287109375, + "learning_rate": 0.0001259412329859018, + "loss": 2.1212, + "step": 5335 + }, + { + "epoch": 1.6567677399187246, + "grad_norm": 0.283203125, + "learning_rate": 0.00012591750276896732, + "loss": 1.5356, + "step": 5336 + }, + { + "epoch": 1.6570803376055019, + "grad_norm": 0.287109375, + "learning_rate": 0.00012589377098736019, + "loss": 1.4372, + "step": 5337 + }, + { + "epoch": 1.657392935292279, + "grad_norm": 0.279296875, + "learning_rate": 0.0001258700376425131, + "loss": 1.4218, + "step": 5338 + }, + { + "epoch": 1.657705532979056, + "grad_norm": 0.3046875, + "learning_rate": 0.00012584630273585886, + "loss": 1.753, + "step": 5339 + }, + { + "epoch": 1.658018130665833, + "grad_norm": 0.287109375, + "learning_rate": 0.0001258225662688304, + "loss": 1.2731, + "step": 5340 + }, + { + "epoch": 1.65833072835261, + "grad_norm": 0.26953125, + "learning_rate": 0.0001257988282428607, + "loss": 1.879, + "step": 5341 + }, + { + "epoch": 1.6586433260393874, + "grad_norm": 0.28515625, + "learning_rate": 0.00012577508865938288, + "loss": 1.5619, + "step": 5342 + }, + { + "epoch": 1.6589559237261644, + "grad_norm": 0.2890625, + "learning_rate": 0.00012575134751983012, + "loss": 1.514, + "step": 5343 + }, + { + "epoch": 1.6592685214129417, + "grad_norm": 0.2890625, + "learning_rate": 0.00012572760482563566, + "loss": 1.4545, + "step": 5344 + }, + { + "epoch": 1.6595811190997187, + "grad_norm": 0.2890625, + "learning_rate": 0.00012570386057823293, + "loss": 1.4924, + "step": 5345 + }, + { + "epoch": 1.6598937167864958, + "grad_norm": 0.263671875, + "learning_rate": 0.00012568011477905538, + "loss": 1.4768, + "step": 5346 + }, + { + "epoch": 1.6602063144732728, + "grad_norm": 0.283203125, + "learning_rate": 0.00012565636742953656, + "loss": 1.6362, + "step": 5347 + }, + { + "epoch": 1.6605189121600499, + "grad_norm": 0.28515625, + "learning_rate": 0.00012563261853111018, + "loss": 1.4759, + "step": 5348 + }, + { + "epoch": 1.6608315098468271, + "grad_norm": 0.29296875, + "learning_rate": 0.00012560886808520992, + "loss": 1.7087, + "step": 5349 + }, + { + "epoch": 1.6611441075336042, + "grad_norm": 0.29296875, + "learning_rate": 0.00012558511609326968, + "loss": 1.6689, + "step": 5350 + }, + { + "epoch": 1.6614567052203815, + "grad_norm": 0.26953125, + "learning_rate": 0.00012556136255672339, + "loss": 1.6328, + "step": 5351 + }, + { + "epoch": 1.6617693029071585, + "grad_norm": 0.28515625, + "learning_rate": 0.00012553760747700503, + "loss": 1.5469, + "step": 5352 + }, + { + "epoch": 1.6620819005939356, + "grad_norm": 0.27734375, + "learning_rate": 0.00012551385085554878, + "loss": 1.9411, + "step": 5353 + }, + { + "epoch": 1.6623944982807126, + 
"grad_norm": 0.27734375, + "learning_rate": 0.00012549009269378886, + "loss": 1.6302, + "step": 5354 + }, + { + "epoch": 1.66270709596749, + "grad_norm": 0.294921875, + "learning_rate": 0.00012546633299315954, + "loss": 1.3778, + "step": 5355 + }, + { + "epoch": 1.663019693654267, + "grad_norm": 0.29296875, + "learning_rate": 0.00012544257175509525, + "loss": 1.6108, + "step": 5356 + }, + { + "epoch": 1.6633322913410442, + "grad_norm": 0.28125, + "learning_rate": 0.00012541880898103052, + "loss": 1.4986, + "step": 5357 + }, + { + "epoch": 1.6636448890278213, + "grad_norm": 0.2890625, + "learning_rate": 0.00012539504467239985, + "loss": 1.6511, + "step": 5358 + }, + { + "epoch": 1.6639574867145983, + "grad_norm": 0.287109375, + "learning_rate": 0.00012537127883063798, + "loss": 1.7448, + "step": 5359 + }, + { + "epoch": 1.6642700844013754, + "grad_norm": 0.31640625, + "learning_rate": 0.00012534751145717969, + "loss": 1.4229, + "step": 5360 + }, + { + "epoch": 1.6645826820881524, + "grad_norm": 0.28515625, + "learning_rate": 0.00012532374255345982, + "loss": 1.8999, + "step": 5361 + }, + { + "epoch": 1.6648952797749297, + "grad_norm": 0.27734375, + "learning_rate": 0.00012529997212091336, + "loss": 1.4283, + "step": 5362 + }, + { + "epoch": 1.6652078774617067, + "grad_norm": 0.28125, + "learning_rate": 0.00012527620016097532, + "loss": 1.5923, + "step": 5363 + }, + { + "epoch": 1.665520475148484, + "grad_norm": 0.27734375, + "learning_rate": 0.00012525242667508089, + "loss": 1.2956, + "step": 5364 + }, + { + "epoch": 1.665833072835261, + "grad_norm": 0.29296875, + "learning_rate": 0.00012522865166466528, + "loss": 1.3285, + "step": 5365 + }, + { + "epoch": 1.666145670522038, + "grad_norm": 0.2890625, + "learning_rate": 0.0001252048751311638, + "loss": 1.4168, + "step": 5366 + }, + { + "epoch": 1.6664582682088152, + "grad_norm": 0.275390625, + "learning_rate": 0.00012518109707601192, + "loss": 1.4103, + "step": 5367 + }, + { + "epoch": 1.6667708658955924, + "grad_norm": 0.28515625, + "learning_rate": 0.0001251573175006451, + "loss": 1.5798, + "step": 5368 + }, + { + "epoch": 1.6670834635823695, + "grad_norm": 0.279296875, + "learning_rate": 0.00012513353640649895, + "loss": 1.488, + "step": 5369 + }, + { + "epoch": 1.6673960612691467, + "grad_norm": 0.267578125, + "learning_rate": 0.00012510975379500917, + "loss": 1.2702, + "step": 5370 + }, + { + "epoch": 1.6677086589559238, + "grad_norm": 0.30078125, + "learning_rate": 0.00012508596966761163, + "loss": 1.5452, + "step": 5371 + }, + { + "epoch": 1.6680212566427008, + "grad_norm": 0.2734375, + "learning_rate": 0.00012506218402574202, + "loss": 1.4895, + "step": 5372 + }, + { + "epoch": 1.668333854329478, + "grad_norm": 0.279296875, + "learning_rate": 0.00012503839687083652, + "loss": 1.7254, + "step": 5373 + }, + { + "epoch": 1.668646452016255, + "grad_norm": 0.296875, + "learning_rate": 0.00012501460820433103, + "loss": 1.6656, + "step": 5374 + }, + { + "epoch": 1.6689590497030322, + "grad_norm": 0.283203125, + "learning_rate": 0.0001249908180276618, + "loss": 1.7435, + "step": 5375 + }, + { + "epoch": 1.6692716473898093, + "grad_norm": 0.2734375, + "learning_rate": 0.00012496702634226504, + "loss": 1.667, + "step": 5376 + }, + { + "epoch": 1.6695842450765865, + "grad_norm": 0.291015625, + "learning_rate": 0.00012494323314957712, + "loss": 1.7495, + "step": 5377 + }, + { + "epoch": 1.6698968427633636, + "grad_norm": 0.279296875, + "learning_rate": 0.00012491943845103438, + "loss": 1.4274, + "step": 5378 + }, + { + "epoch": 
1.6702094404501406, + "grad_norm": 0.283203125, + "learning_rate": 0.00012489564224807338, + "loss": 1.6359, + "step": 5379 + }, + { + "epoch": 1.6705220381369177, + "grad_norm": 0.283203125, + "learning_rate": 0.00012487184454213073, + "loss": 1.5136, + "step": 5380 + }, + { + "epoch": 1.670834635823695, + "grad_norm": 0.298828125, + "learning_rate": 0.00012484804533464315, + "loss": 1.4721, + "step": 5381 + }, + { + "epoch": 1.671147233510472, + "grad_norm": 0.271484375, + "learning_rate": 0.0001248242446270474, + "loss": 1.6776, + "step": 5382 + }, + { + "epoch": 1.6714598311972493, + "grad_norm": 0.275390625, + "learning_rate": 0.00012480044242078043, + "loss": 1.5605, + "step": 5383 + }, + { + "epoch": 1.6717724288840263, + "grad_norm": 0.29296875, + "learning_rate": 0.00012477663871727907, + "loss": 1.3675, + "step": 5384 + }, + { + "epoch": 1.6720850265708034, + "grad_norm": 0.279296875, + "learning_rate": 0.0001247528335179805, + "loss": 1.4221, + "step": 5385 + }, + { + "epoch": 1.6723976242575804, + "grad_norm": 0.2734375, + "learning_rate": 0.00012472902682432178, + "loss": 1.4183, + "step": 5386 + }, + { + "epoch": 1.6727102219443575, + "grad_norm": 0.30078125, + "learning_rate": 0.00012470521863774023, + "loss": 1.6643, + "step": 5387 + }, + { + "epoch": 1.6730228196311347, + "grad_norm": 0.283203125, + "learning_rate": 0.00012468140895967314, + "loss": 1.5463, + "step": 5388 + }, + { + "epoch": 1.6733354173179118, + "grad_norm": 0.26953125, + "learning_rate": 0.00012465759779155798, + "loss": 1.3038, + "step": 5389 + }, + { + "epoch": 1.673648015004689, + "grad_norm": 0.27734375, + "learning_rate": 0.00012463378513483215, + "loss": 1.6816, + "step": 5390 + }, + { + "epoch": 1.6739606126914661, + "grad_norm": 0.2734375, + "learning_rate": 0.00012460997099093335, + "loss": 1.8097, + "step": 5391 + }, + { + "epoch": 1.6742732103782432, + "grad_norm": 0.28515625, + "learning_rate": 0.00012458615536129925, + "loss": 1.2823, + "step": 5392 + }, + { + "epoch": 1.6745858080650202, + "grad_norm": 0.283203125, + "learning_rate": 0.0001245623382473676, + "loss": 1.3995, + "step": 5393 + }, + { + "epoch": 1.6748984057517975, + "grad_norm": 0.275390625, + "learning_rate": 0.0001245385196505763, + "loss": 1.4217, + "step": 5394 + }, + { + "epoch": 1.6752110034385745, + "grad_norm": 0.275390625, + "learning_rate": 0.00012451469957236336, + "loss": 1.466, + "step": 5395 + }, + { + "epoch": 1.6755236011253518, + "grad_norm": 0.283203125, + "learning_rate": 0.00012449087801416668, + "loss": 1.4582, + "step": 5396 + }, + { + "epoch": 1.6758361988121289, + "grad_norm": 0.296875, + "learning_rate": 0.00012446705497742454, + "loss": 1.6292, + "step": 5397 + }, + { + "epoch": 1.676148796498906, + "grad_norm": 0.2890625, + "learning_rate": 0.00012444323046357505, + "loss": 1.6885, + "step": 5398 + }, + { + "epoch": 1.676461394185683, + "grad_norm": 0.27734375, + "learning_rate": 0.00012441940447405664, + "loss": 1.5673, + "step": 5399 + }, + { + "epoch": 1.67677399187246, + "grad_norm": 0.2734375, + "learning_rate": 0.00012439557701030763, + "loss": 1.3717, + "step": 5400 + }, + { + "epoch": 1.6770865895592373, + "grad_norm": 0.283203125, + "learning_rate": 0.00012437174807376658, + "loss": 1.873, + "step": 5401 + }, + { + "epoch": 1.6773991872460143, + "grad_norm": 0.294921875, + "learning_rate": 0.00012434791766587205, + "loss": 1.7381, + "step": 5402 + }, + { + "epoch": 1.6777117849327916, + "grad_norm": 0.287109375, + "learning_rate": 0.00012432408578806267, + "loss": 1.4546, + "step": 5403 + }, 
+ { + "epoch": 1.6780243826195687, + "grad_norm": 0.291015625, + "learning_rate": 0.00012430025244177727, + "loss": 1.4063, + "step": 5404 + }, + { + "epoch": 1.6783369803063457, + "grad_norm": 0.27734375, + "learning_rate": 0.00012427641762845465, + "loss": 1.4917, + "step": 5405 + }, + { + "epoch": 1.6786495779931228, + "grad_norm": 0.283203125, + "learning_rate": 0.00012425258134953371, + "loss": 1.6743, + "step": 5406 + }, + { + "epoch": 1.6789621756798998, + "grad_norm": 0.2734375, + "learning_rate": 0.00012422874360645358, + "loss": 1.7126, + "step": 5407 + }, + { + "epoch": 1.679274773366677, + "grad_norm": 0.2734375, + "learning_rate": 0.00012420490440065335, + "loss": 1.4323, + "step": 5408 + }, + { + "epoch": 1.6795873710534543, + "grad_norm": 0.294921875, + "learning_rate": 0.00012418106373357217, + "loss": 1.4172, + "step": 5409 + }, + { + "epoch": 1.6798999687402314, + "grad_norm": 0.3046875, + "learning_rate": 0.00012415722160664933, + "loss": 1.8328, + "step": 5410 + }, + { + "epoch": 1.6802125664270084, + "grad_norm": 0.275390625, + "learning_rate": 0.00012413337802132424, + "loss": 1.5253, + "step": 5411 + }, + { + "epoch": 1.6805251641137855, + "grad_norm": 0.27734375, + "learning_rate": 0.00012410953297903643, + "loss": 1.4609, + "step": 5412 + }, + { + "epoch": 1.6808377618005625, + "grad_norm": 0.275390625, + "learning_rate": 0.00012408568648122531, + "loss": 1.6043, + "step": 5413 + }, + { + "epoch": 1.6811503594873398, + "grad_norm": 0.2890625, + "learning_rate": 0.00012406183852933068, + "loss": 1.4588, + "step": 5414 + }, + { + "epoch": 1.6814629571741169, + "grad_norm": 0.279296875, + "learning_rate": 0.00012403798912479216, + "loss": 1.3241, + "step": 5415 + }, + { + "epoch": 1.6817755548608941, + "grad_norm": 0.287109375, + "learning_rate": 0.00012401413826904957, + "loss": 1.2465, + "step": 5416 + }, + { + "epoch": 1.6820881525476712, + "grad_norm": 0.2734375, + "learning_rate": 0.0001239902859635429, + "loss": 1.6052, + "step": 5417 + }, + { + "epoch": 1.6824007502344482, + "grad_norm": 0.28125, + "learning_rate": 0.00012396643220971207, + "loss": 1.7932, + "step": 5418 + }, + { + "epoch": 1.6827133479212253, + "grad_norm": 0.29296875, + "learning_rate": 0.00012394257700899718, + "loss": 1.3542, + "step": 5419 + }, + { + "epoch": 1.6830259456080023, + "grad_norm": 0.271484375, + "learning_rate": 0.00012391872036283843, + "loss": 1.7026, + "step": 5420 + }, + { + "epoch": 1.6833385432947796, + "grad_norm": 0.27734375, + "learning_rate": 0.00012389486227267605, + "loss": 1.519, + "step": 5421 + }, + { + "epoch": 1.6836511409815569, + "grad_norm": 0.291015625, + "learning_rate": 0.00012387100273995036, + "loss": 1.4518, + "step": 5422 + }, + { + "epoch": 1.683963738668334, + "grad_norm": 0.306640625, + "learning_rate": 0.00012384714176610185, + "loss": 1.7116, + "step": 5423 + }, + { + "epoch": 1.684276336355111, + "grad_norm": 0.29296875, + "learning_rate": 0.00012382327935257098, + "loss": 1.656, + "step": 5424 + }, + { + "epoch": 1.684588934041888, + "grad_norm": 0.2734375, + "learning_rate": 0.00012379941550079836, + "loss": 1.7991, + "step": 5425 + }, + { + "epoch": 1.684901531728665, + "grad_norm": 0.30078125, + "learning_rate": 0.00012377555021222473, + "loss": 1.425, + "step": 5426 + }, + { + "epoch": 1.6852141294154424, + "grad_norm": 0.263671875, + "learning_rate": 0.00012375168348829085, + "loss": 1.6756, + "step": 5427 + }, + { + "epoch": 1.6855267271022194, + "grad_norm": 0.279296875, + "learning_rate": 0.00012372781533043754, + "loss": 1.5876, + 
"step": 5428 + }, + { + "epoch": 1.6858393247889967, + "grad_norm": 0.287109375, + "learning_rate": 0.00012370394574010577, + "loss": 1.6633, + "step": 5429 + }, + { + "epoch": 1.6861519224757737, + "grad_norm": 0.28125, + "learning_rate": 0.00012368007471873663, + "loss": 1.4921, + "step": 5430 + }, + { + "epoch": 1.6864645201625508, + "grad_norm": 0.283203125, + "learning_rate": 0.00012365620226777117, + "loss": 1.5537, + "step": 5431 + }, + { + "epoch": 1.6867771178493278, + "grad_norm": 0.291015625, + "learning_rate": 0.00012363232838865065, + "loss": 1.3548, + "step": 5432 + }, + { + "epoch": 1.6870897155361049, + "grad_norm": 0.28515625, + "learning_rate": 0.00012360845308281634, + "loss": 1.4696, + "step": 5433 + }, + { + "epoch": 1.6874023132228821, + "grad_norm": 0.287109375, + "learning_rate": 0.00012358457635170965, + "loss": 1.6547, + "step": 5434 + }, + { + "epoch": 1.6877149109096594, + "grad_norm": 0.283203125, + "learning_rate": 0.000123560698196772, + "loss": 1.4953, + "step": 5435 + }, + { + "epoch": 1.6880275085964365, + "grad_norm": 0.28515625, + "learning_rate": 0.000123536818619445, + "loss": 2.0835, + "step": 5436 + }, + { + "epoch": 1.6883401062832135, + "grad_norm": 0.2890625, + "learning_rate": 0.00012351293762117026, + "loss": 1.351, + "step": 5437 + }, + { + "epoch": 1.6886527039699906, + "grad_norm": 0.296875, + "learning_rate": 0.00012348905520338953, + "loss": 1.482, + "step": 5438 + }, + { + "epoch": 1.6889653016567676, + "grad_norm": 0.27734375, + "learning_rate": 0.0001234651713675446, + "loss": 1.6435, + "step": 5439 + }, + { + "epoch": 1.6892778993435449, + "grad_norm": 0.291015625, + "learning_rate": 0.00012344128611507734, + "loss": 1.4134, + "step": 5440 + }, + { + "epoch": 1.689590497030322, + "grad_norm": 0.2734375, + "learning_rate": 0.0001234173994474298, + "loss": 1.5871, + "step": 5441 + }, + { + "epoch": 1.6899030947170992, + "grad_norm": 0.283203125, + "learning_rate": 0.00012339351136604403, + "loss": 1.4284, + "step": 5442 + }, + { + "epoch": 1.6902156924038763, + "grad_norm": 0.41015625, + "learning_rate": 0.0001233696218723621, + "loss": 2.1214, + "step": 5443 + }, + { + "epoch": 1.6905282900906533, + "grad_norm": 0.306640625, + "learning_rate": 0.00012334573096782638, + "loss": 1.8422, + "step": 5444 + }, + { + "epoch": 1.6908408877774304, + "grad_norm": 0.28125, + "learning_rate": 0.00012332183865387908, + "loss": 1.6021, + "step": 5445 + }, + { + "epoch": 1.6911534854642074, + "grad_norm": 0.275390625, + "learning_rate": 0.00012329794493196272, + "loss": 1.5454, + "step": 5446 + }, + { + "epoch": 1.6914660831509847, + "grad_norm": 0.298828125, + "learning_rate": 0.00012327404980351972, + "loss": 1.5053, + "step": 5447 + }, + { + "epoch": 1.691778680837762, + "grad_norm": 0.29296875, + "learning_rate": 0.00012325015326999269, + "loss": 1.449, + "step": 5448 + }, + { + "epoch": 1.692091278524539, + "grad_norm": 0.28515625, + "learning_rate": 0.0001232262553328242, + "loss": 1.4623, + "step": 5449 + }, + { + "epoch": 1.692403876211316, + "grad_norm": 0.287109375, + "learning_rate": 0.00012320235599345714, + "loss": 1.6873, + "step": 5450 + }, + { + "epoch": 1.692716473898093, + "grad_norm": 0.28515625, + "learning_rate": 0.00012317845525333426, + "loss": 1.7797, + "step": 5451 + }, + { + "epoch": 1.6930290715848701, + "grad_norm": 0.29296875, + "learning_rate": 0.0001231545531138985, + "loss": 1.4857, + "step": 5452 + }, + { + "epoch": 1.6933416692716474, + "grad_norm": 0.28515625, + "learning_rate": 0.0001231306495765929, + "loss": 
1.6462, + "step": 5453 + }, + { + "epoch": 1.6936542669584245, + "grad_norm": 0.283203125, + "learning_rate": 0.0001231067446428605, + "loss": 1.4372, + "step": 5454 + }, + { + "epoch": 1.6939668646452017, + "grad_norm": 0.287109375, + "learning_rate": 0.00012308283831414445, + "loss": 1.2759, + "step": 5455 + }, + { + "epoch": 1.6942794623319788, + "grad_norm": 0.265625, + "learning_rate": 0.00012305893059188805, + "loss": 1.3461, + "step": 5456 + }, + { + "epoch": 1.6945920600187558, + "grad_norm": 0.294921875, + "learning_rate": 0.0001230350214775346, + "loss": 1.544, + "step": 5457 + }, + { + "epoch": 1.694904657705533, + "grad_norm": 0.34765625, + "learning_rate": 0.0001230111109725276, + "loss": 2.1854, + "step": 5458 + }, + { + "epoch": 1.69521725539231, + "grad_norm": 0.291015625, + "learning_rate": 0.0001229871990783105, + "loss": 1.6051, + "step": 5459 + }, + { + "epoch": 1.6955298530790872, + "grad_norm": 0.283203125, + "learning_rate": 0.00012296328579632687, + "loss": 1.603, + "step": 5460 + }, + { + "epoch": 1.6958424507658645, + "grad_norm": 0.306640625, + "learning_rate": 0.0001229393711280204, + "loss": 1.6455, + "step": 5461 + }, + { + "epoch": 1.6961550484526415, + "grad_norm": 0.28125, + "learning_rate": 0.0001229154550748349, + "loss": 1.3636, + "step": 5462 + }, + { + "epoch": 1.6964676461394186, + "grad_norm": 0.28125, + "learning_rate": 0.00012289153763821417, + "loss": 1.6271, + "step": 5463 + }, + { + "epoch": 1.6967802438261956, + "grad_norm": 0.287109375, + "learning_rate": 0.00012286761881960214, + "loss": 1.7037, + "step": 5464 + }, + { + "epoch": 1.6970928415129727, + "grad_norm": 0.291015625, + "learning_rate": 0.00012284369862044283, + "loss": 1.6694, + "step": 5465 + }, + { + "epoch": 1.69740543919975, + "grad_norm": 0.294921875, + "learning_rate": 0.0001228197770421803, + "loss": 1.4603, + "step": 5466 + }, + { + "epoch": 1.697718036886527, + "grad_norm": 0.2734375, + "learning_rate": 0.00012279585408625875, + "loss": 1.4662, + "step": 5467 + }, + { + "epoch": 1.6980306345733043, + "grad_norm": 0.291015625, + "learning_rate": 0.00012277192975412247, + "loss": 1.3703, + "step": 5468 + }, + { + "epoch": 1.6983432322600813, + "grad_norm": 0.2890625, + "learning_rate": 0.00012274800404721574, + "loss": 1.5891, + "step": 5469 + }, + { + "epoch": 1.6986558299468584, + "grad_norm": 0.2890625, + "learning_rate": 0.00012272407696698305, + "loss": 1.8182, + "step": 5470 + }, + { + "epoch": 1.6989684276336354, + "grad_norm": 0.275390625, + "learning_rate": 0.00012270014851486886, + "loss": 1.6121, + "step": 5471 + }, + { + "epoch": 1.6992810253204125, + "grad_norm": 0.287109375, + "learning_rate": 0.00012267621869231778, + "loss": 1.3421, + "step": 5472 + }, + { + "epoch": 1.6995936230071897, + "grad_norm": 0.283203125, + "learning_rate": 0.00012265228750077447, + "loss": 1.5966, + "step": 5473 + }, + { + "epoch": 1.699906220693967, + "grad_norm": 0.283203125, + "learning_rate": 0.0001226283549416837, + "loss": 1.38, + "step": 5474 + }, + { + "epoch": 1.700218818380744, + "grad_norm": 0.28125, + "learning_rate": 0.00012260442101649031, + "loss": 1.7053, + "step": 5475 + }, + { + "epoch": 1.7005314160675211, + "grad_norm": 0.279296875, + "learning_rate": 0.0001225804857266392, + "loss": 1.483, + "step": 5476 + }, + { + "epoch": 1.7008440137542982, + "grad_norm": 0.3046875, + "learning_rate": 0.00012255654907357544, + "loss": 1.6787, + "step": 5477 + }, + { + "epoch": 1.7011566114410752, + "grad_norm": 0.28515625, + "learning_rate": 0.00012253261105874404, + "loss": 
1.5571, + "step": 5478 + }, + { + "epoch": 1.7014692091278525, + "grad_norm": 0.28125, + "learning_rate": 0.00012250867168359016, + "loss": 1.8794, + "step": 5479 + }, + { + "epoch": 1.7017818068146295, + "grad_norm": 0.26953125, + "learning_rate": 0.00012248473094955914, + "loss": 1.4557, + "step": 5480 + }, + { + "epoch": 1.7020944045014068, + "grad_norm": 0.27734375, + "learning_rate": 0.00012246078885809624, + "loss": 1.8304, + "step": 5481 + }, + { + "epoch": 1.7024070021881839, + "grad_norm": 0.279296875, + "learning_rate": 0.00012243684541064692, + "loss": 1.7549, + "step": 5482 + }, + { + "epoch": 1.702719599874961, + "grad_norm": 0.28515625, + "learning_rate": 0.0001224129006086566, + "loss": 1.728, + "step": 5483 + }, + { + "epoch": 1.703032197561738, + "grad_norm": 0.287109375, + "learning_rate": 0.00012238895445357094, + "loss": 1.6094, + "step": 5484 + }, + { + "epoch": 1.703344795248515, + "grad_norm": 0.2890625, + "learning_rate": 0.00012236500694683556, + "loss": 1.5115, + "step": 5485 + }, + { + "epoch": 1.7036573929352923, + "grad_norm": 0.28125, + "learning_rate": 0.00012234105808989621, + "loss": 1.2878, + "step": 5486 + }, + { + "epoch": 1.7039699906220696, + "grad_norm": 0.287109375, + "learning_rate": 0.00012231710788419871, + "loss": 1.3691, + "step": 5487 + }, + { + "epoch": 1.7042825883088466, + "grad_norm": 0.296875, + "learning_rate": 0.000122293156331189, + "loss": 1.6046, + "step": 5488 + }, + { + "epoch": 1.7045951859956237, + "grad_norm": 0.283203125, + "learning_rate": 0.00012226920343231303, + "loss": 1.3087, + "step": 5489 + }, + { + "epoch": 1.7049077836824007, + "grad_norm": 0.27734375, + "learning_rate": 0.00012224524918901685, + "loss": 1.5949, + "step": 5490 + }, + { + "epoch": 1.7052203813691778, + "grad_norm": 0.2734375, + "learning_rate": 0.0001222212936027466, + "loss": 1.404, + "step": 5491 + }, + { + "epoch": 1.705532979055955, + "grad_norm": 0.2890625, + "learning_rate": 0.0001221973366749486, + "loss": 1.4464, + "step": 5492 + }, + { + "epoch": 1.705845576742732, + "grad_norm": 0.294921875, + "learning_rate": 0.00012217337840706905, + "loss": 1.6028, + "step": 5493 + }, + { + "epoch": 1.7061581744295093, + "grad_norm": 0.333984375, + "learning_rate": 0.00012214941880055443, + "loss": 1.9414, + "step": 5494 + }, + { + "epoch": 1.7064707721162864, + "grad_norm": 0.2890625, + "learning_rate": 0.00012212545785685115, + "loss": 1.6799, + "step": 5495 + }, + { + "epoch": 1.7067833698030634, + "grad_norm": 0.2734375, + "learning_rate": 0.0001221014955774058, + "loss": 1.4497, + "step": 5496 + }, + { + "epoch": 1.7070959674898405, + "grad_norm": 0.28515625, + "learning_rate": 0.00012207753196366496, + "loss": 1.3977, + "step": 5497 + }, + { + "epoch": 1.7074085651766175, + "grad_norm": 0.30078125, + "learning_rate": 0.00012205356701707541, + "loss": 1.5275, + "step": 5498 + }, + { + "epoch": 1.7077211628633948, + "grad_norm": 0.29296875, + "learning_rate": 0.00012202960073908389, + "loss": 1.6336, + "step": 5499 + }, + { + "epoch": 1.708033760550172, + "grad_norm": 0.298828125, + "learning_rate": 0.00012200563313113733, + "loss": 1.4646, + "step": 5500 + }, + { + "epoch": 1.7083463582369491, + "grad_norm": 0.294921875, + "learning_rate": 0.00012198166419468266, + "loss": 1.5693, + "step": 5501 + }, + { + "epoch": 1.7086589559237262, + "grad_norm": 0.267578125, + "learning_rate": 0.00012195769393116692, + "loss": 1.7618, + "step": 5502 + }, + { + "epoch": 1.7089715536105032, + "grad_norm": 0.271484375, + "learning_rate": 0.00012193372234203718, + 
"loss": 1.634, + "step": 5503 + }, + { + "epoch": 1.7092841512972803, + "grad_norm": 0.330078125, + "learning_rate": 0.00012190974942874069, + "loss": 2.2696, + "step": 5504 + }, + { + "epoch": 1.7095967489840576, + "grad_norm": 0.287109375, + "learning_rate": 0.00012188577519272468, + "loss": 1.6976, + "step": 5505 + }, + { + "epoch": 1.7099093466708346, + "grad_norm": 0.28125, + "learning_rate": 0.00012186179963543655, + "loss": 1.6067, + "step": 5506 + }, + { + "epoch": 1.7102219443576119, + "grad_norm": 0.28125, + "learning_rate": 0.00012183782275832374, + "loss": 1.4573, + "step": 5507 + }, + { + "epoch": 1.710534542044389, + "grad_norm": 0.287109375, + "learning_rate": 0.00012181384456283374, + "loss": 1.6158, + "step": 5508 + }, + { + "epoch": 1.710847139731166, + "grad_norm": 0.3046875, + "learning_rate": 0.00012178986505041412, + "loss": 1.4864, + "step": 5509 + }, + { + "epoch": 1.711159737417943, + "grad_norm": 0.275390625, + "learning_rate": 0.0001217658842225126, + "loss": 1.45, + "step": 5510 + }, + { + "epoch": 1.71147233510472, + "grad_norm": 0.33203125, + "learning_rate": 0.00012174190208057687, + "loss": 2.2129, + "step": 5511 + }, + { + "epoch": 1.7117849327914973, + "grad_norm": 0.294921875, + "learning_rate": 0.00012171791862605488, + "loss": 1.7555, + "step": 5512 + }, + { + "epoch": 1.7120975304782746, + "grad_norm": 0.3359375, + "learning_rate": 0.0001216939338603944, + "loss": 2.254, + "step": 5513 + }, + { + "epoch": 1.7124101281650517, + "grad_norm": 0.27734375, + "learning_rate": 0.00012166994778504355, + "loss": 1.5429, + "step": 5514 + }, + { + "epoch": 1.7127227258518287, + "grad_norm": 0.275390625, + "learning_rate": 0.0001216459604014503, + "loss": 1.6032, + "step": 5515 + }, + { + "epoch": 1.7130353235386058, + "grad_norm": 0.29296875, + "learning_rate": 0.00012162197171106282, + "loss": 1.3861, + "step": 5516 + }, + { + "epoch": 1.7133479212253828, + "grad_norm": 0.26953125, + "learning_rate": 0.00012159798171532937, + "loss": 1.5142, + "step": 5517 + }, + { + "epoch": 1.71366051891216, + "grad_norm": 0.287109375, + "learning_rate": 0.00012157399041569826, + "loss": 1.4573, + "step": 5518 + }, + { + "epoch": 1.7139731165989371, + "grad_norm": 0.302734375, + "learning_rate": 0.00012154999781361782, + "loss": 1.5277, + "step": 5519 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.29296875, + "learning_rate": 0.0001215260039105366, + "loss": 1.747, + "step": 5520 + }, + { + "epoch": 1.7145983119724915, + "grad_norm": 0.267578125, + "learning_rate": 0.00012150200870790305, + "loss": 1.3299, + "step": 5521 + }, + { + "epoch": 1.7149109096592685, + "grad_norm": 0.2890625, + "learning_rate": 0.00012147801220716585, + "loss": 1.4527, + "step": 5522 + }, + { + "epoch": 1.7152235073460456, + "grad_norm": 0.271484375, + "learning_rate": 0.00012145401440977366, + "loss": 1.3729, + "step": 5523 + }, + { + "epoch": 1.7155361050328226, + "grad_norm": 0.28515625, + "learning_rate": 0.00012143001531717535, + "loss": 1.4216, + "step": 5524 + }, + { + "epoch": 1.7158487027195999, + "grad_norm": 0.3046875, + "learning_rate": 0.00012140601493081964, + "loss": 1.5151, + "step": 5525 + }, + { + "epoch": 1.7161613004063772, + "grad_norm": 0.2734375, + "learning_rate": 0.00012138201325215557, + "loss": 1.671, + "step": 5526 + }, + { + "epoch": 1.7164738980931542, + "grad_norm": 0.287109375, + "learning_rate": 0.00012135801028263207, + "loss": 1.379, + "step": 5527 + }, + { + "epoch": 1.7167864957799313, + "grad_norm": 0.28125, + "learning_rate": 0.0001213340060236983, + 
"loss": 1.6137, + "step": 5528 + }, + { + "epoch": 1.7170990934667083, + "grad_norm": 0.263671875, + "learning_rate": 0.00012131000047680339, + "loss": 1.4956, + "step": 5529 + }, + { + "epoch": 1.7174116911534854, + "grad_norm": 0.291015625, + "learning_rate": 0.00012128599364339664, + "loss": 1.5429, + "step": 5530 + }, + { + "epoch": 1.7177242888402626, + "grad_norm": 0.287109375, + "learning_rate": 0.00012126198552492727, + "loss": 1.4727, + "step": 5531 + }, + { + "epoch": 1.7180368865270397, + "grad_norm": 0.2890625, + "learning_rate": 0.00012123797612284475, + "loss": 1.5302, + "step": 5532 + }, + { + "epoch": 1.718349484213817, + "grad_norm": 0.27734375, + "learning_rate": 0.00012121396543859855, + "loss": 1.7077, + "step": 5533 + }, + { + "epoch": 1.718662081900594, + "grad_norm": 0.302734375, + "learning_rate": 0.00012118995347363824, + "loss": 1.517, + "step": 5534 + }, + { + "epoch": 1.718974679587371, + "grad_norm": 0.2890625, + "learning_rate": 0.0001211659402294134, + "loss": 1.7013, + "step": 5535 + }, + { + "epoch": 1.719287277274148, + "grad_norm": 0.279296875, + "learning_rate": 0.00012114192570737383, + "loss": 1.4074, + "step": 5536 + }, + { + "epoch": 1.7195998749609251, + "grad_norm": 0.287109375, + "learning_rate": 0.00012111790990896923, + "loss": 1.7123, + "step": 5537 + }, + { + "epoch": 1.7199124726477024, + "grad_norm": 0.30859375, + "learning_rate": 0.00012109389283564952, + "loss": 1.513, + "step": 5538 + }, + { + "epoch": 1.7202250703344797, + "grad_norm": 0.2890625, + "learning_rate": 0.0001210698744888646, + "loss": 1.7622, + "step": 5539 + }, + { + "epoch": 1.7205376680212567, + "grad_norm": 0.2890625, + "learning_rate": 0.0001210458548700645, + "loss": 1.5918, + "step": 5540 + }, + { + "epoch": 1.7208502657080338, + "grad_norm": 0.294921875, + "learning_rate": 0.00012102183398069932, + "loss": 1.4455, + "step": 5541 + }, + { + "epoch": 1.7211628633948108, + "grad_norm": 0.29296875, + "learning_rate": 0.00012099781182221929, + "loss": 1.3134, + "step": 5542 + }, + { + "epoch": 1.7214754610815879, + "grad_norm": 0.291015625, + "learning_rate": 0.00012097378839607453, + "loss": 1.68, + "step": 5543 + }, + { + "epoch": 1.7217880587683652, + "grad_norm": 0.287109375, + "learning_rate": 0.00012094976370371548, + "loss": 1.5016, + "step": 5544 + }, + { + "epoch": 1.7221006564551422, + "grad_norm": 0.27734375, + "learning_rate": 0.00012092573774659248, + "loss": 1.6668, + "step": 5545 + }, + { + "epoch": 1.7224132541419195, + "grad_norm": 0.283203125, + "learning_rate": 0.00012090171052615605, + "loss": 1.3413, + "step": 5546 + }, + { + "epoch": 1.7227258518286965, + "grad_norm": 0.275390625, + "learning_rate": 0.00012087768204385668, + "loss": 1.7143, + "step": 5547 + }, + { + "epoch": 1.7230384495154736, + "grad_norm": 0.287109375, + "learning_rate": 0.0001208536523011451, + "loss": 1.3945, + "step": 5548 + }, + { + "epoch": 1.7233510472022506, + "grad_norm": 0.291015625, + "learning_rate": 0.0001208296212994719, + "loss": 1.4571, + "step": 5549 + }, + { + "epoch": 1.7236636448890277, + "grad_norm": 0.2890625, + "learning_rate": 0.00012080558904028796, + "loss": 1.7265, + "step": 5550 + }, + { + "epoch": 1.723976242575805, + "grad_norm": 0.291015625, + "learning_rate": 0.00012078155552504404, + "loss": 1.2998, + "step": 5551 + }, + { + "epoch": 1.7242888402625822, + "grad_norm": 0.294921875, + "learning_rate": 0.00012075752075519118, + "loss": 1.5475, + "step": 5552 + }, + { + "epoch": 1.7246014379493593, + "grad_norm": 0.2890625, + "learning_rate": 
0.00012073348473218031, + "loss": 1.5446, + "step": 5553 + }, + { + "epoch": 1.7249140356361363, + "grad_norm": 0.30078125, + "learning_rate": 0.0001207094474574626, + "loss": 1.8914, + "step": 5554 + }, + { + "epoch": 1.7252266333229134, + "grad_norm": 0.283203125, + "learning_rate": 0.0001206854089324891, + "loss": 1.8127, + "step": 5555 + }, + { + "epoch": 1.7255392310096904, + "grad_norm": 0.287109375, + "learning_rate": 0.00012066136915871113, + "loss": 1.435, + "step": 5556 + }, + { + "epoch": 1.7258518286964677, + "grad_norm": 0.294921875, + "learning_rate": 0.00012063732813757995, + "loss": 1.6434, + "step": 5557 + }, + { + "epoch": 1.7261644263832447, + "grad_norm": 0.294921875, + "learning_rate": 0.00012061328587054701, + "loss": 1.4652, + "step": 5558 + }, + { + "epoch": 1.726477024070022, + "grad_norm": 0.2734375, + "learning_rate": 0.0001205892423590637, + "loss": 1.3189, + "step": 5559 + }, + { + "epoch": 1.726789621756799, + "grad_norm": 0.279296875, + "learning_rate": 0.00012056519760458163, + "loss": 1.3971, + "step": 5560 + }, + { + "epoch": 1.7271022194435761, + "grad_norm": 0.294921875, + "learning_rate": 0.00012054115160855233, + "loss": 1.2699, + "step": 5561 + }, + { + "epoch": 1.7274148171303532, + "grad_norm": 0.275390625, + "learning_rate": 0.00012051710437242756, + "loss": 1.3872, + "step": 5562 + }, + { + "epoch": 1.7277274148171302, + "grad_norm": 0.283203125, + "learning_rate": 0.00012049305589765905, + "loss": 1.6135, + "step": 5563 + }, + { + "epoch": 1.7280400125039075, + "grad_norm": 0.2890625, + "learning_rate": 0.00012046900618569863, + "loss": 1.5243, + "step": 5564 + }, + { + "epoch": 1.7283526101906845, + "grad_norm": 0.302734375, + "learning_rate": 0.00012044495523799824, + "loss": 1.4615, + "step": 5565 + }, + { + "epoch": 1.7286652078774618, + "grad_norm": 0.27734375, + "learning_rate": 0.00012042090305600984, + "loss": 1.6188, + "step": 5566 + }, + { + "epoch": 1.7289778055642389, + "grad_norm": 0.294921875, + "learning_rate": 0.00012039684964118547, + "loss": 1.6997, + "step": 5567 + }, + { + "epoch": 1.729290403251016, + "grad_norm": 0.283203125, + "learning_rate": 0.00012037279499497734, + "loss": 1.6044, + "step": 5568 + }, + { + "epoch": 1.729603000937793, + "grad_norm": 0.294921875, + "learning_rate": 0.00012034873911883759, + "loss": 1.4182, + "step": 5569 + }, + { + "epoch": 1.7299155986245702, + "grad_norm": 0.298828125, + "learning_rate": 0.00012032468201421853, + "loss": 1.4877, + "step": 5570 + }, + { + "epoch": 1.7302281963113473, + "grad_norm": 0.279296875, + "learning_rate": 0.00012030062368257253, + "loss": 1.6926, + "step": 5571 + }, + { + "epoch": 1.7305407939981245, + "grad_norm": 0.275390625, + "learning_rate": 0.00012027656412535202, + "loss": 1.8898, + "step": 5572 + }, + { + "epoch": 1.7308533916849016, + "grad_norm": 0.287109375, + "learning_rate": 0.00012025250334400946, + "loss": 1.5818, + "step": 5573 + }, + { + "epoch": 1.7311659893716786, + "grad_norm": 0.310546875, + "learning_rate": 0.0001202284413399975, + "loss": 1.716, + "step": 5574 + }, + { + "epoch": 1.7314785870584557, + "grad_norm": 0.2734375, + "learning_rate": 0.00012020437811476872, + "loss": 1.6398, + "step": 5575 + }, + { + "epoch": 1.7317911847452327, + "grad_norm": 0.27734375, + "learning_rate": 0.00012018031366977591, + "loss": 1.4197, + "step": 5576 + }, + { + "epoch": 1.73210378243201, + "grad_norm": 0.29296875, + "learning_rate": 0.00012015624800647185, + "loss": 1.7078, + "step": 5577 + }, + { + "epoch": 1.732416380118787, + "grad_norm": 
0.275390625, + "learning_rate": 0.00012013218112630942, + "loss": 1.6071, + "step": 5578 + }, + { + "epoch": 1.7327289778055643, + "grad_norm": 0.283203125, + "learning_rate": 0.00012010811303074154, + "loss": 1.4001, + "step": 5579 + }, + { + "epoch": 1.7330415754923414, + "grad_norm": 0.287109375, + "learning_rate": 0.00012008404372122126, + "loss": 1.4807, + "step": 5580 + }, + { + "epoch": 1.7333541731791184, + "grad_norm": 0.29296875, + "learning_rate": 0.00012005997319920166, + "loss": 1.4388, + "step": 5581 + }, + { + "epoch": 1.7336667708658955, + "grad_norm": 0.2734375, + "learning_rate": 0.00012003590146613592, + "loss": 1.571, + "step": 5582 + }, + { + "epoch": 1.7339793685526728, + "grad_norm": 0.2578125, + "learning_rate": 0.0001200118285234773, + "loss": 1.3583, + "step": 5583 + }, + { + "epoch": 1.7342919662394498, + "grad_norm": 0.275390625, + "learning_rate": 0.00011998775437267906, + "loss": 1.5505, + "step": 5584 + }, + { + "epoch": 1.734604563926227, + "grad_norm": 0.279296875, + "learning_rate": 0.00011996367901519463, + "loss": 1.297, + "step": 5585 + }, + { + "epoch": 1.7349171616130041, + "grad_norm": 0.314453125, + "learning_rate": 0.00011993960245247746, + "loss": 1.6469, + "step": 5586 + }, + { + "epoch": 1.7352297592997812, + "grad_norm": 0.287109375, + "learning_rate": 0.00011991552468598106, + "loss": 1.4381, + "step": 5587 + }, + { + "epoch": 1.7355423569865582, + "grad_norm": 0.314453125, + "learning_rate": 0.00011989144571715908, + "loss": 1.4866, + "step": 5588 + }, + { + "epoch": 1.7358549546733353, + "grad_norm": 0.275390625, + "learning_rate": 0.00011986736554746518, + "loss": 1.5193, + "step": 5589 + }, + { + "epoch": 1.7361675523601126, + "grad_norm": 0.291015625, + "learning_rate": 0.00011984328417835309, + "loss": 1.3002, + "step": 5590 + }, + { + "epoch": 1.7364801500468896, + "grad_norm": 0.28125, + "learning_rate": 0.00011981920161127665, + "loss": 1.2616, + "step": 5591 + }, + { + "epoch": 1.7367927477336669, + "grad_norm": 0.287109375, + "learning_rate": 0.00011979511784768975, + "loss": 1.5705, + "step": 5592 + }, + { + "epoch": 1.737105345420444, + "grad_norm": 0.28125, + "learning_rate": 0.00011977103288904635, + "loss": 1.627, + "step": 5593 + }, + { + "epoch": 1.737417943107221, + "grad_norm": 0.283203125, + "learning_rate": 0.00011974694673680052, + "loss": 1.4498, + "step": 5594 + }, + { + "epoch": 1.737730540793998, + "grad_norm": 0.29296875, + "learning_rate": 0.00011972285939240635, + "loss": 1.4407, + "step": 5595 + }, + { + "epoch": 1.7380431384807753, + "grad_norm": 0.279296875, + "learning_rate": 0.00011969877085731802, + "loss": 1.3994, + "step": 5596 + }, + { + "epoch": 1.7383557361675523, + "grad_norm": 0.296875, + "learning_rate": 0.00011967468113298978, + "loss": 1.4255, + "step": 5597 + }, + { + "epoch": 1.7386683338543296, + "grad_norm": 0.28515625, + "learning_rate": 0.00011965059022087599, + "loss": 1.3308, + "step": 5598 + }, + { + "epoch": 1.7389809315411067, + "grad_norm": 0.294921875, + "learning_rate": 0.00011962649812243101, + "loss": 1.576, + "step": 5599 + }, + { + "epoch": 1.7392935292278837, + "grad_norm": 0.291015625, + "learning_rate": 0.00011960240483910934, + "loss": 1.5344, + "step": 5600 + }, + { + "epoch": 1.7396061269146608, + "grad_norm": 0.302734375, + "learning_rate": 0.0001195783103723655, + "loss": 1.3961, + "step": 5601 + }, + { + "epoch": 1.7399187246014378, + "grad_norm": 0.283203125, + "learning_rate": 0.00011955421472365413, + "loss": 1.4467, + "step": 5602 + }, + { + "epoch": 
1.740231322288215, + "grad_norm": 0.2890625, + "learning_rate": 0.00011953011789442987, + "loss": 1.5196, + "step": 5603 + }, + { + "epoch": 1.7405439199749921, + "grad_norm": 0.283203125, + "learning_rate": 0.00011950601988614754, + "loss": 1.8276, + "step": 5604 + }, + { + "epoch": 1.7408565176617694, + "grad_norm": 0.29296875, + "learning_rate": 0.0001194819207002619, + "loss": 1.4798, + "step": 5605 + }, + { + "epoch": 1.7411691153485465, + "grad_norm": 0.28515625, + "learning_rate": 0.0001194578203382279, + "loss": 1.6241, + "step": 5606 + }, + { + "epoch": 1.7414817130353235, + "grad_norm": 0.27734375, + "learning_rate": 0.00011943371880150049, + "loss": 1.336, + "step": 5607 + }, + { + "epoch": 1.7417943107221006, + "grad_norm": 0.27734375, + "learning_rate": 0.00011940961609153472, + "loss": 1.5331, + "step": 5608 + }, + { + "epoch": 1.7421069084088776, + "grad_norm": 0.287109375, + "learning_rate": 0.00011938551220978566, + "loss": 1.4683, + "step": 5609 + }, + { + "epoch": 1.7424195060956549, + "grad_norm": 0.291015625, + "learning_rate": 0.00011936140715770856, + "loss": 1.5798, + "step": 5610 + }, + { + "epoch": 1.7427321037824322, + "grad_norm": 0.27734375, + "learning_rate": 0.00011933730093675861, + "loss": 1.7233, + "step": 5611 + }, + { + "epoch": 1.7430447014692092, + "grad_norm": 0.310546875, + "learning_rate": 0.00011931319354839118, + "loss": 1.5721, + "step": 5612 + }, + { + "epoch": 1.7433572991559863, + "grad_norm": 0.28515625, + "learning_rate": 0.00011928908499406164, + "loss": 1.7042, + "step": 5613 + }, + { + "epoch": 1.7436698968427633, + "grad_norm": 0.30078125, + "learning_rate": 0.00011926497527522546, + "loss": 1.662, + "step": 5614 + }, + { + "epoch": 1.7439824945295404, + "grad_norm": 0.2890625, + "learning_rate": 0.00011924086439333817, + "loss": 1.6444, + "step": 5615 + }, + { + "epoch": 1.7442950922163176, + "grad_norm": 0.2890625, + "learning_rate": 0.0001192167523498554, + "loss": 1.5565, + "step": 5616 + }, + { + "epoch": 1.7446076899030947, + "grad_norm": 0.28515625, + "learning_rate": 0.00011919263914623276, + "loss": 1.6223, + "step": 5617 + }, + { + "epoch": 1.744920287589872, + "grad_norm": 0.287109375, + "learning_rate": 0.00011916852478392607, + "loss": 1.8656, + "step": 5618 + }, + { + "epoch": 1.745232885276649, + "grad_norm": 0.29296875, + "learning_rate": 0.0001191444092643911, + "loss": 1.4532, + "step": 5619 + }, + { + "epoch": 1.745545482963426, + "grad_norm": 0.291015625, + "learning_rate": 0.00011912029258908372, + "loss": 1.5138, + "step": 5620 + }, + { + "epoch": 1.745858080650203, + "grad_norm": 0.283203125, + "learning_rate": 0.00011909617475945995, + "loss": 1.8386, + "step": 5621 + }, + { + "epoch": 1.7461706783369801, + "grad_norm": 0.27734375, + "learning_rate": 0.00011907205577697577, + "loss": 1.6579, + "step": 5622 + }, + { + "epoch": 1.7464832760237574, + "grad_norm": 0.287109375, + "learning_rate": 0.00011904793564308727, + "loss": 1.4472, + "step": 5623 + }, + { + "epoch": 1.7467958737105347, + "grad_norm": 0.27734375, + "learning_rate": 0.00011902381435925064, + "loss": 1.7691, + "step": 5624 + }, + { + "epoch": 1.7471084713973117, + "grad_norm": 0.287109375, + "learning_rate": 0.00011899969192692207, + "loss": 1.7234, + "step": 5625 + }, + { + "epoch": 1.7474210690840888, + "grad_norm": 0.2890625, + "learning_rate": 0.00011897556834755788, + "loss": 1.8648, + "step": 5626 + }, + { + "epoch": 1.7477336667708658, + "grad_norm": 0.296875, + "learning_rate": 0.00011895144362261448, + "loss": 1.605, + "step": 5627 + }, + { 
+ "epoch": 1.7480462644576429, + "grad_norm": 0.283203125, + "learning_rate": 0.00011892731775354827, + "loss": 1.3145, + "step": 5628 + }, + { + "epoch": 1.7483588621444202, + "grad_norm": 0.326171875, + "learning_rate": 0.00011890319074181576, + "loss": 1.5106, + "step": 5629 + }, + { + "epoch": 1.7486714598311972, + "grad_norm": 0.28125, + "learning_rate": 0.00011887906258887351, + "loss": 1.7029, + "step": 5630 + }, + { + "epoch": 1.7489840575179745, + "grad_norm": 0.28125, + "learning_rate": 0.00011885493329617825, + "loss": 1.7823, + "step": 5631 + }, + { + "epoch": 1.7492966552047515, + "grad_norm": 0.2890625, + "learning_rate": 0.00011883080286518659, + "loss": 1.4526, + "step": 5632 + }, + { + "epoch": 1.7496092528915286, + "grad_norm": 0.271484375, + "learning_rate": 0.00011880667129735539, + "loss": 1.6187, + "step": 5633 + }, + { + "epoch": 1.7499218505783056, + "grad_norm": 0.28515625, + "learning_rate": 0.00011878253859414148, + "loss": 1.6686, + "step": 5634 + }, + { + "epoch": 1.7502344482650827, + "grad_norm": 0.275390625, + "learning_rate": 0.00011875840475700175, + "loss": 1.4003, + "step": 5635 + }, + { + "epoch": 1.75054704595186, + "grad_norm": 0.279296875, + "learning_rate": 0.00011873426978739326, + "loss": 1.7285, + "step": 5636 + }, + { + "epoch": 1.7508596436386372, + "grad_norm": 0.27734375, + "learning_rate": 0.00011871013368677302, + "loss": 1.4314, + "step": 5637 + }, + { + "epoch": 1.7511722413254143, + "grad_norm": 0.291015625, + "learning_rate": 0.00011868599645659815, + "loss": 1.7216, + "step": 5638 + }, + { + "epoch": 1.7514848390121913, + "grad_norm": 0.291015625, + "learning_rate": 0.00011866185809832588, + "loss": 1.6379, + "step": 5639 + }, + { + "epoch": 1.7517974366989684, + "grad_norm": 0.29296875, + "learning_rate": 0.00011863771861341347, + "loss": 1.8494, + "step": 5640 + }, + { + "epoch": 1.7521100343857454, + "grad_norm": 0.271484375, + "learning_rate": 0.00011861357800331818, + "loss": 1.6464, + "step": 5641 + }, + { + "epoch": 1.7524226320725227, + "grad_norm": 0.287109375, + "learning_rate": 0.00011858943626949752, + "loss": 1.4425, + "step": 5642 + }, + { + "epoch": 1.7527352297592997, + "grad_norm": 0.275390625, + "learning_rate": 0.0001185652934134089, + "loss": 1.258, + "step": 5643 + }, + { + "epoch": 1.753047827446077, + "grad_norm": 0.298828125, + "learning_rate": 0.00011854114943650983, + "loss": 1.5252, + "step": 5644 + }, + { + "epoch": 1.753360425132854, + "grad_norm": 0.283203125, + "learning_rate": 0.00011851700434025795, + "loss": 1.5082, + "step": 5645 + }, + { + "epoch": 1.753673022819631, + "grad_norm": 0.2890625, + "learning_rate": 0.00011849285812611093, + "loss": 1.2727, + "step": 5646 + }, + { + "epoch": 1.7539856205064082, + "grad_norm": 0.29296875, + "learning_rate": 0.00011846871079552649, + "loss": 1.5176, + "step": 5647 + }, + { + "epoch": 1.7542982181931852, + "grad_norm": 0.28515625, + "learning_rate": 0.00011844456234996246, + "loss": 1.5197, + "step": 5648 + }, + { + "epoch": 1.7546108158799625, + "grad_norm": 0.287109375, + "learning_rate": 0.0001184204127908767, + "loss": 1.4711, + "step": 5649 + }, + { + "epoch": 1.7549234135667398, + "grad_norm": 0.29296875, + "learning_rate": 0.00011839626211972712, + "loss": 1.5836, + "step": 5650 + }, + { + "epoch": 1.7552360112535168, + "grad_norm": 0.291015625, + "learning_rate": 0.00011837211033797175, + "loss": 1.4072, + "step": 5651 + }, + { + "epoch": 1.7555486089402939, + "grad_norm": 0.28125, + "learning_rate": 0.0001183479574470687, + "loss": 1.6971, + "step": 
5652 + }, + { + "epoch": 1.755861206627071, + "grad_norm": 0.283203125, + "learning_rate": 0.00011832380344847604, + "loss": 1.5817, + "step": 5653 + }, + { + "epoch": 1.756173804313848, + "grad_norm": 0.2890625, + "learning_rate": 0.00011829964834365205, + "loss": 1.5695, + "step": 5654 + }, + { + "epoch": 1.7564864020006252, + "grad_norm": 0.28515625, + "learning_rate": 0.00011827549213405497, + "loss": 1.5927, + "step": 5655 + }, + { + "epoch": 1.7567989996874023, + "grad_norm": 0.265625, + "learning_rate": 0.00011825133482114312, + "loss": 1.3896, + "step": 5656 + }, + { + "epoch": 1.7571115973741795, + "grad_norm": 0.279296875, + "learning_rate": 0.00011822717640637493, + "loss": 1.6788, + "step": 5657 + }, + { + "epoch": 1.7574241950609566, + "grad_norm": 0.275390625, + "learning_rate": 0.00011820301689120887, + "loss": 1.2537, + "step": 5658 + }, + { + "epoch": 1.7577367927477336, + "grad_norm": 0.287109375, + "learning_rate": 0.0001181788562771035, + "loss": 1.4692, + "step": 5659 + }, + { + "epoch": 1.7580493904345107, + "grad_norm": 0.29296875, + "learning_rate": 0.00011815469456551743, + "loss": 1.6666, + "step": 5660 + }, + { + "epoch": 1.7583619881212877, + "grad_norm": 0.28125, + "learning_rate": 0.0001181305317579093, + "loss": 1.69, + "step": 5661 + }, + { + "epoch": 1.758674585808065, + "grad_norm": 0.2734375, + "learning_rate": 0.00011810636785573784, + "loss": 1.5877, + "step": 5662 + }, + { + "epoch": 1.7589871834948423, + "grad_norm": 0.28515625, + "learning_rate": 0.0001180822028604619, + "loss": 1.3923, + "step": 5663 + }, + { + "epoch": 1.7592997811816193, + "grad_norm": 0.283203125, + "learning_rate": 0.00011805803677354031, + "loss": 1.6914, + "step": 5664 + }, + { + "epoch": 1.7596123788683964, + "grad_norm": 0.267578125, + "learning_rate": 0.00011803386959643205, + "loss": 1.4907, + "step": 5665 + }, + { + "epoch": 1.7599249765551734, + "grad_norm": 0.279296875, + "learning_rate": 0.0001180097013305961, + "loss": 1.583, + "step": 5666 + }, + { + "epoch": 1.7602375742419505, + "grad_norm": 0.2890625, + "learning_rate": 0.00011798553197749152, + "loss": 1.4735, + "step": 5667 + }, + { + "epoch": 1.7605501719287278, + "grad_norm": 0.28125, + "learning_rate": 0.00011796136153857744, + "loss": 1.7344, + "step": 5668 + }, + { + "epoch": 1.7608627696155048, + "grad_norm": 0.287109375, + "learning_rate": 0.00011793719001531311, + "loss": 1.4779, + "step": 5669 + }, + { + "epoch": 1.761175367302282, + "grad_norm": 0.265625, + "learning_rate": 0.00011791301740915774, + "loss": 1.5088, + "step": 5670 + }, + { + "epoch": 1.7614879649890591, + "grad_norm": 0.287109375, + "learning_rate": 0.00011788884372157072, + "loss": 1.4494, + "step": 5671 + }, + { + "epoch": 1.7618005626758362, + "grad_norm": 0.296875, + "learning_rate": 0.00011786466895401136, + "loss": 1.5749, + "step": 5672 + }, + { + "epoch": 1.7621131603626132, + "grad_norm": 0.28515625, + "learning_rate": 0.00011784049310793919, + "loss": 1.3202, + "step": 5673 + }, + { + "epoch": 1.7624257580493903, + "grad_norm": 0.283203125, + "learning_rate": 0.00011781631618481371, + "loss": 1.4288, + "step": 5674 + }, + { + "epoch": 1.7627383557361676, + "grad_norm": 0.302734375, + "learning_rate": 0.00011779213818609451, + "loss": 1.4675, + "step": 5675 + }, + { + "epoch": 1.7630509534229448, + "grad_norm": 0.287109375, + "learning_rate": 0.00011776795911324127, + "loss": 1.6259, + "step": 5676 + }, + { + "epoch": 1.7633635511097219, + "grad_norm": 0.28515625, + "learning_rate": 0.00011774377896771371, + "loss": 1.7223, + 
"step": 5677 + }, + { + "epoch": 1.763676148796499, + "grad_norm": 0.2890625, + "learning_rate": 0.0001177195977509716, + "loss": 1.5379, + "step": 5678 + }, + { + "epoch": 1.763988746483276, + "grad_norm": 0.27734375, + "learning_rate": 0.00011769541546447478, + "loss": 1.4777, + "step": 5679 + }, + { + "epoch": 1.764301344170053, + "grad_norm": 0.28125, + "learning_rate": 0.00011767123210968316, + "loss": 1.8492, + "step": 5680 + }, + { + "epoch": 1.7646139418568303, + "grad_norm": 0.26953125, + "learning_rate": 0.00011764704768805677, + "loss": 1.3462, + "step": 5681 + }, + { + "epoch": 1.7649265395436073, + "grad_norm": 0.287109375, + "learning_rate": 0.00011762286220105558, + "loss": 1.5042, + "step": 5682 + }, + { + "epoch": 1.7652391372303846, + "grad_norm": 0.298828125, + "learning_rate": 0.0001175986756501398, + "loss": 1.7829, + "step": 5683 + }, + { + "epoch": 1.7655517349171617, + "grad_norm": 0.283203125, + "learning_rate": 0.00011757448803676951, + "loss": 1.6488, + "step": 5684 + }, + { + "epoch": 1.7658643326039387, + "grad_norm": 0.28125, + "learning_rate": 0.00011755029936240501, + "loss": 1.5213, + "step": 5685 + }, + { + "epoch": 1.7661769302907158, + "grad_norm": 0.287109375, + "learning_rate": 0.00011752610962850652, + "loss": 1.7373, + "step": 5686 + }, + { + "epoch": 1.7664895279774928, + "grad_norm": 0.294921875, + "learning_rate": 0.0001175019188365345, + "loss": 1.5355, + "step": 5687 + }, + { + "epoch": 1.76680212566427, + "grad_norm": 0.28125, + "learning_rate": 0.00011747772698794932, + "loss": 1.3043, + "step": 5688 + }, + { + "epoch": 1.7671147233510474, + "grad_norm": 0.298828125, + "learning_rate": 0.00011745353408421154, + "loss": 1.5752, + "step": 5689 + }, + { + "epoch": 1.7674273210378244, + "grad_norm": 0.279296875, + "learning_rate": 0.0001174293401267816, + "loss": 1.5955, + "step": 5690 + }, + { + "epoch": 1.7677399187246015, + "grad_norm": 0.26171875, + "learning_rate": 0.00011740514511712022, + "loss": 1.3573, + "step": 5691 + }, + { + "epoch": 1.7680525164113785, + "grad_norm": 0.28515625, + "learning_rate": 0.00011738094905668802, + "loss": 1.5345, + "step": 5692 + }, + { + "epoch": 1.7683651140981556, + "grad_norm": 0.287109375, + "learning_rate": 0.0001173567519469458, + "loss": 1.5061, + "step": 5693 + }, + { + "epoch": 1.7686777117849328, + "grad_norm": 0.296875, + "learning_rate": 0.00011733255378935432, + "loss": 1.4444, + "step": 5694 + }, + { + "epoch": 1.7689903094717099, + "grad_norm": 0.287109375, + "learning_rate": 0.00011730835458537455, + "loss": 1.524, + "step": 5695 + }, + { + "epoch": 1.7693029071584871, + "grad_norm": 0.287109375, + "learning_rate": 0.00011728415433646728, + "loss": 1.5181, + "step": 5696 + }, + { + "epoch": 1.7696155048452642, + "grad_norm": 0.287109375, + "learning_rate": 0.00011725995304409363, + "loss": 1.43, + "step": 5697 + }, + { + "epoch": 1.7699281025320412, + "grad_norm": 0.310546875, + "learning_rate": 0.00011723575070971459, + "loss": 1.4528, + "step": 5698 + }, + { + "epoch": 1.7702407002188183, + "grad_norm": 0.279296875, + "learning_rate": 0.00011721154733479135, + "loss": 1.8226, + "step": 5699 + }, + { + "epoch": 1.7705532979055953, + "grad_norm": 0.291015625, + "learning_rate": 0.00011718734292078503, + "loss": 1.4686, + "step": 5700 + }, + { + "epoch": 1.7708658955923726, + "grad_norm": 0.298828125, + "learning_rate": 0.00011716313746915696, + "loss": 1.4765, + "step": 5701 + }, + { + "epoch": 1.77117849327915, + "grad_norm": 0.29296875, + "learning_rate": 0.00011713893098136839, + "loss": 
1.6228, + "step": 5702 + }, + { + "epoch": 1.771491090965927, + "grad_norm": 0.279296875, + "learning_rate": 0.00011711472345888071, + "loss": 1.4686, + "step": 5703 + }, + { + "epoch": 1.771803688652704, + "grad_norm": 0.287109375, + "learning_rate": 0.00011709051490315534, + "loss": 1.5617, + "step": 5704 + }, + { + "epoch": 1.772116286339481, + "grad_norm": 0.2890625, + "learning_rate": 0.00011706630531565386, + "loss": 1.4651, + "step": 5705 + }, + { + "epoch": 1.772428884026258, + "grad_norm": 0.26953125, + "learning_rate": 0.00011704209469783773, + "loss": 1.6951, + "step": 5706 + }, + { + "epoch": 1.7727414817130354, + "grad_norm": 0.298828125, + "learning_rate": 0.00011701788305116868, + "loss": 1.3827, + "step": 5707 + }, + { + "epoch": 1.7730540793998124, + "grad_norm": 0.27734375, + "learning_rate": 0.00011699367037710829, + "loss": 1.63, + "step": 5708 + }, + { + "epoch": 1.7733666770865897, + "grad_norm": 0.279296875, + "learning_rate": 0.0001169694566771184, + "loss": 1.3955, + "step": 5709 + }, + { + "epoch": 1.7736792747733667, + "grad_norm": 0.29296875, + "learning_rate": 0.00011694524195266077, + "loss": 1.5899, + "step": 5710 + }, + { + "epoch": 1.7739918724601438, + "grad_norm": 0.271484375, + "learning_rate": 0.00011692102620519729, + "loss": 1.1515, + "step": 5711 + }, + { + "epoch": 1.7743044701469208, + "grad_norm": 0.28125, + "learning_rate": 0.0001168968094361899, + "loss": 1.8175, + "step": 5712 + }, + { + "epoch": 1.7746170678336979, + "grad_norm": 0.28515625, + "learning_rate": 0.0001168725916471006, + "loss": 1.6077, + "step": 5713 + }, + { + "epoch": 1.7749296655204752, + "grad_norm": 0.28125, + "learning_rate": 0.00011684837283939143, + "loss": 1.61, + "step": 5714 + }, + { + "epoch": 1.7752422632072524, + "grad_norm": 0.283203125, + "learning_rate": 0.00011682415301452452, + "loss": 1.3214, + "step": 5715 + }, + { + "epoch": 1.7755548608940295, + "grad_norm": 0.296875, + "learning_rate": 0.00011679993217396206, + "loss": 1.6774, + "step": 5716 + }, + { + "epoch": 1.7758674585808065, + "grad_norm": 0.2890625, + "learning_rate": 0.00011677571031916627, + "loss": 1.6886, + "step": 5717 + }, + { + "epoch": 1.7761800562675836, + "grad_norm": 0.2890625, + "learning_rate": 0.00011675148745159949, + "loss": 1.554, + "step": 5718 + }, + { + "epoch": 1.7764926539543606, + "grad_norm": 0.283203125, + "learning_rate": 0.00011672726357272408, + "loss": 1.3875, + "step": 5719 + }, + { + "epoch": 1.776805251641138, + "grad_norm": 0.287109375, + "learning_rate": 0.00011670303868400241, + "loss": 1.2662, + "step": 5720 + }, + { + "epoch": 1.777117849327915, + "grad_norm": 0.287109375, + "learning_rate": 0.00011667881278689705, + "loss": 1.4726, + "step": 5721 + }, + { + "epoch": 1.7774304470146922, + "grad_norm": 0.287109375, + "learning_rate": 0.00011665458588287048, + "loss": 1.4821, + "step": 5722 + }, + { + "epoch": 1.7777430447014693, + "grad_norm": 0.275390625, + "learning_rate": 0.00011663035797338535, + "loss": 1.5008, + "step": 5723 + }, + { + "epoch": 1.7780556423882463, + "grad_norm": 0.279296875, + "learning_rate": 0.00011660612905990431, + "loss": 1.3734, + "step": 5724 + }, + { + "epoch": 1.7783682400750234, + "grad_norm": 0.27734375, + "learning_rate": 0.00011658189914389011, + "loss": 1.869, + "step": 5725 + }, + { + "epoch": 1.7786808377618004, + "grad_norm": 0.275390625, + "learning_rate": 0.00011655766822680552, + "loss": 1.5493, + "step": 5726 + }, + { + "epoch": 1.7789934354485777, + "grad_norm": 0.27734375, + "learning_rate": 0.0001165334363101134, + 
"loss": 1.5773, + "step": 5727 + }, + { + "epoch": 1.779306033135355, + "grad_norm": 0.28125, + "learning_rate": 0.00011650920339527665, + "loss": 1.2535, + "step": 5728 + }, + { + "epoch": 1.779618630822132, + "grad_norm": 0.314453125, + "learning_rate": 0.00011648496948375827, + "loss": 1.5035, + "step": 5729 + }, + { + "epoch": 1.779931228508909, + "grad_norm": 0.29296875, + "learning_rate": 0.00011646073457702127, + "loss": 1.5281, + "step": 5730 + }, + { + "epoch": 1.780243826195686, + "grad_norm": 0.287109375, + "learning_rate": 0.00011643649867652875, + "loss": 1.5663, + "step": 5731 + }, + { + "epoch": 1.7805564238824632, + "grad_norm": 0.287109375, + "learning_rate": 0.00011641226178374385, + "loss": 1.4889, + "step": 5732 + }, + { + "epoch": 1.7808690215692404, + "grad_norm": 0.2734375, + "learning_rate": 0.0001163880239001298, + "loss": 1.5487, + "step": 5733 + }, + { + "epoch": 1.7811816192560175, + "grad_norm": 0.287109375, + "learning_rate": 0.00011636378502714984, + "loss": 1.4356, + "step": 5734 + }, + { + "epoch": 1.7814942169427948, + "grad_norm": 0.283203125, + "learning_rate": 0.00011633954516626735, + "loss": 1.821, + "step": 5735 + }, + { + "epoch": 1.7818068146295718, + "grad_norm": 0.32421875, + "learning_rate": 0.00011631530431894571, + "loss": 1.3116, + "step": 5736 + }, + { + "epoch": 1.7821194123163488, + "grad_norm": 0.26953125, + "learning_rate": 0.00011629106248664834, + "loss": 1.5322, + "step": 5737 + }, + { + "epoch": 1.782432010003126, + "grad_norm": 0.279296875, + "learning_rate": 0.00011626681967083877, + "loss": 1.7254, + "step": 5738 + }, + { + "epoch": 1.782744607689903, + "grad_norm": 0.28125, + "learning_rate": 0.00011624257587298056, + "loss": 1.6379, + "step": 5739 + }, + { + "epoch": 1.7830572053766802, + "grad_norm": 0.27734375, + "learning_rate": 0.00011621833109453734, + "loss": 1.4709, + "step": 5740 + }, + { + "epoch": 1.7833698030634575, + "grad_norm": 0.287109375, + "learning_rate": 0.00011619408533697282, + "loss": 1.604, + "step": 5741 + }, + { + "epoch": 1.7836824007502345, + "grad_norm": 0.283203125, + "learning_rate": 0.00011616983860175075, + "loss": 1.3792, + "step": 5742 + }, + { + "epoch": 1.7839949984370116, + "grad_norm": 0.283203125, + "learning_rate": 0.0001161455908903349, + "loss": 1.5704, + "step": 5743 + }, + { + "epoch": 1.7843075961237886, + "grad_norm": 0.2734375, + "learning_rate": 0.00011612134220418913, + "loss": 1.5962, + "step": 5744 + }, + { + "epoch": 1.7846201938105657, + "grad_norm": 0.283203125, + "learning_rate": 0.00011609709254477741, + "loss": 1.4397, + "step": 5745 + }, + { + "epoch": 1.784932791497343, + "grad_norm": 0.287109375, + "learning_rate": 0.0001160728419135637, + "loss": 1.5748, + "step": 5746 + }, + { + "epoch": 1.78524538918412, + "grad_norm": 0.28515625, + "learning_rate": 0.00011604859031201202, + "loss": 1.5306, + "step": 5747 + }, + { + "epoch": 1.7855579868708973, + "grad_norm": 0.27734375, + "learning_rate": 0.00011602433774158654, + "loss": 1.6912, + "step": 5748 + }, + { + "epoch": 1.7858705845576743, + "grad_norm": 0.3203125, + "learning_rate": 0.00011600008420375135, + "loss": 1.5453, + "step": 5749 + }, + { + "epoch": 1.7861831822444514, + "grad_norm": 0.287109375, + "learning_rate": 0.00011597582969997067, + "loss": 1.3837, + "step": 5750 + }, + { + "epoch": 1.7864957799312284, + "grad_norm": 0.34765625, + "learning_rate": 0.00011595157423170879, + "loss": 1.457, + "step": 5751 + }, + { + "epoch": 1.7868083776180055, + "grad_norm": 0.287109375, + "learning_rate": 
0.00011592731780043005, + "loss": 1.3404, + "step": 5752 + }, + { + "epoch": 1.7871209753047828, + "grad_norm": 0.29296875, + "learning_rate": 0.00011590306040759886, + "loss": 1.4875, + "step": 5753 + }, + { + "epoch": 1.78743357299156, + "grad_norm": 0.27734375, + "learning_rate": 0.00011587880205467965, + "loss": 1.4653, + "step": 5754 + }, + { + "epoch": 1.787746170678337, + "grad_norm": 0.345703125, + "learning_rate": 0.00011585454274313692, + "loss": 2.3537, + "step": 5755 + }, + { + "epoch": 1.7880587683651141, + "grad_norm": 0.2890625, + "learning_rate": 0.00011583028247443522, + "loss": 1.8865, + "step": 5756 + }, + { + "epoch": 1.7883713660518912, + "grad_norm": 0.28515625, + "learning_rate": 0.00011580602125003921, + "loss": 1.7538, + "step": 5757 + }, + { + "epoch": 1.7886839637386682, + "grad_norm": 0.3125, + "learning_rate": 0.00011578175907141354, + "loss": 1.6741, + "step": 5758 + }, + { + "epoch": 1.7889965614254455, + "grad_norm": 0.275390625, + "learning_rate": 0.00011575749594002299, + "loss": 1.5212, + "step": 5759 + }, + { + "epoch": 1.7893091591122225, + "grad_norm": 0.296875, + "learning_rate": 0.00011573323185733231, + "loss": 1.6821, + "step": 5760 + }, + { + "epoch": 1.7896217567989998, + "grad_norm": 0.27734375, + "learning_rate": 0.00011570896682480638, + "loss": 1.3915, + "step": 5761 + }, + { + "epoch": 1.7899343544857769, + "grad_norm": 0.27734375, + "learning_rate": 0.0001156847008439101, + "loss": 1.7713, + "step": 5762 + }, + { + "epoch": 1.790246952172554, + "grad_norm": 0.291015625, + "learning_rate": 0.00011566043391610843, + "loss": 1.6889, + "step": 5763 + }, + { + "epoch": 1.790559549859331, + "grad_norm": 0.302734375, + "learning_rate": 0.00011563616604286643, + "loss": 1.4813, + "step": 5764 + }, + { + "epoch": 1.790872147546108, + "grad_norm": 0.279296875, + "learning_rate": 0.00011561189722564918, + "loss": 1.5978, + "step": 5765 + }, + { + "epoch": 1.7911847452328853, + "grad_norm": 0.2890625, + "learning_rate": 0.00011558762746592175, + "loss": 1.4352, + "step": 5766 + }, + { + "epoch": 1.7914973429196626, + "grad_norm": 0.2734375, + "learning_rate": 0.00011556335676514942, + "loss": 1.4835, + "step": 5767 + }, + { + "epoch": 1.7918099406064396, + "grad_norm": 0.3125, + "learning_rate": 0.00011553908512479738, + "loss": 1.5135, + "step": 5768 + }, + { + "epoch": 1.7921225382932167, + "grad_norm": 0.287109375, + "learning_rate": 0.000115514812546331, + "loss": 1.3119, + "step": 5769 + }, + { + "epoch": 1.7924351359799937, + "grad_norm": 0.294921875, + "learning_rate": 0.00011549053903121561, + "loss": 1.9613, + "step": 5770 + }, + { + "epoch": 1.7927477336667708, + "grad_norm": 0.287109375, + "learning_rate": 0.00011546626458091665, + "loss": 2.0067, + "step": 5771 + }, + { + "epoch": 1.793060331353548, + "grad_norm": 0.279296875, + "learning_rate": 0.00011544198919689957, + "loss": 1.2376, + "step": 5772 + }, + { + "epoch": 1.793372929040325, + "grad_norm": 0.291015625, + "learning_rate": 0.00011541771288062993, + "loss": 1.6036, + "step": 5773 + }, + { + "epoch": 1.7936855267271024, + "grad_norm": 0.294921875, + "learning_rate": 0.0001153934356335733, + "loss": 1.7418, + "step": 5774 + }, + { + "epoch": 1.7939981244138794, + "grad_norm": 0.2734375, + "learning_rate": 0.00011536915745719537, + "loss": 1.5465, + "step": 5775 + }, + { + "epoch": 1.7943107221006565, + "grad_norm": 0.27734375, + "learning_rate": 0.00011534487835296179, + "loss": 1.543, + "step": 5776 + }, + { + "epoch": 1.7946233197874335, + "grad_norm": 0.291015625, + 
"learning_rate": 0.0001153205983223384, + "loss": 1.5265, + "step": 5777 + }, + { + "epoch": 1.7949359174742106, + "grad_norm": 0.298828125, + "learning_rate": 0.00011529631736679092, + "loss": 1.351, + "step": 5778 + }, + { + "epoch": 1.7952485151609878, + "grad_norm": 0.291015625, + "learning_rate": 0.0001152720354877853, + "loss": 1.6525, + "step": 5779 + }, + { + "epoch": 1.7955611128477649, + "grad_norm": 0.291015625, + "learning_rate": 0.00011524775268678739, + "loss": 1.6968, + "step": 5780 + }, + { + "epoch": 1.7958737105345421, + "grad_norm": 0.27734375, + "learning_rate": 0.00011522346896526326, + "loss": 1.4864, + "step": 5781 + }, + { + "epoch": 1.7961863082213192, + "grad_norm": 0.302734375, + "learning_rate": 0.00011519918432467887, + "loss": 1.6597, + "step": 5782 + }, + { + "epoch": 1.7964989059080962, + "grad_norm": 0.26953125, + "learning_rate": 0.00011517489876650041, + "loss": 1.5225, + "step": 5783 + }, + { + "epoch": 1.7968115035948733, + "grad_norm": 0.275390625, + "learning_rate": 0.00011515061229219392, + "loss": 1.6224, + "step": 5784 + }, + { + "epoch": 1.7971241012816506, + "grad_norm": 0.283203125, + "learning_rate": 0.0001151263249032257, + "loss": 1.3991, + "step": 5785 + }, + { + "epoch": 1.7974366989684276, + "grad_norm": 0.279296875, + "learning_rate": 0.00011510203660106195, + "loss": 1.5783, + "step": 5786 + }, + { + "epoch": 1.7977492966552049, + "grad_norm": 0.279296875, + "learning_rate": 0.00011507774738716901, + "loss": 1.7398, + "step": 5787 + }, + { + "epoch": 1.798061894341982, + "grad_norm": 0.287109375, + "learning_rate": 0.00011505345726301325, + "loss": 1.5499, + "step": 5788 + }, + { + "epoch": 1.798374492028759, + "grad_norm": 0.271484375, + "learning_rate": 0.00011502916623006107, + "loss": 1.4048, + "step": 5789 + }, + { + "epoch": 1.798687089715536, + "grad_norm": 0.29296875, + "learning_rate": 0.00011500487428977901, + "loss": 1.6217, + "step": 5790 + }, + { + "epoch": 1.798999687402313, + "grad_norm": 0.287109375, + "learning_rate": 0.00011498058144363356, + "loss": 1.4457, + "step": 5791 + }, + { + "epoch": 1.7993122850890904, + "grad_norm": 0.29296875, + "learning_rate": 0.00011495628769309128, + "loss": 1.7167, + "step": 5792 + }, + { + "epoch": 1.7996248827758674, + "grad_norm": 0.279296875, + "learning_rate": 0.0001149319930396189, + "loss": 1.4423, + "step": 5793 + }, + { + "epoch": 1.7999374804626447, + "grad_norm": 0.271484375, + "learning_rate": 0.00011490769748468304, + "loss": 1.3099, + "step": 5794 + }, + { + "epoch": 1.8002500781494217, + "grad_norm": 0.29296875, + "learning_rate": 0.00011488340102975051, + "loss": 1.3818, + "step": 5795 + }, + { + "epoch": 1.8005626758361988, + "grad_norm": 0.30078125, + "learning_rate": 0.00011485910367628809, + "loss": 1.5885, + "step": 5796 + }, + { + "epoch": 1.8008752735229758, + "grad_norm": 0.283203125, + "learning_rate": 0.00011483480542576264, + "loss": 1.6998, + "step": 5797 + }, + { + "epoch": 1.801187871209753, + "grad_norm": 0.2890625, + "learning_rate": 0.00011481050627964107, + "loss": 1.4423, + "step": 5798 + }, + { + "epoch": 1.8015004688965301, + "grad_norm": 0.28125, + "learning_rate": 0.00011478620623939039, + "loss": 1.533, + "step": 5799 + }, + { + "epoch": 1.8018130665833074, + "grad_norm": 0.291015625, + "learning_rate": 0.00011476190530647755, + "loss": 1.3115, + "step": 5800 + }, + { + "epoch": 1.8021256642700845, + "grad_norm": 0.29296875, + "learning_rate": 0.00011473760348236973, + "loss": 1.9355, + "step": 5801 + }, + { + "epoch": 1.8024382619568615, + 
"grad_norm": 0.291015625, + "learning_rate": 0.00011471330076853398, + "loss": 1.6219, + "step": 5802 + }, + { + "epoch": 1.8027508596436386, + "grad_norm": 0.2890625, + "learning_rate": 0.00011468899716643753, + "loss": 1.6172, + "step": 5803 + }, + { + "epoch": 1.8030634573304156, + "grad_norm": 0.28515625, + "learning_rate": 0.00011466469267754756, + "loss": 1.6974, + "step": 5804 + }, + { + "epoch": 1.803376055017193, + "grad_norm": 0.306640625, + "learning_rate": 0.00011464038730333144, + "loss": 1.465, + "step": 5805 + }, + { + "epoch": 1.80368865270397, + "grad_norm": 0.2890625, + "learning_rate": 0.00011461608104525647, + "loss": 1.5435, + "step": 5806 + }, + { + "epoch": 1.8040012503907472, + "grad_norm": 0.302734375, + "learning_rate": 0.00011459177390479008, + "loss": 1.537, + "step": 5807 + }, + { + "epoch": 1.8043138480775243, + "grad_norm": 0.28515625, + "learning_rate": 0.0001145674658833997, + "loss": 1.6696, + "step": 5808 + }, + { + "epoch": 1.8046264457643013, + "grad_norm": 0.283203125, + "learning_rate": 0.00011454315698255284, + "loss": 1.3391, + "step": 5809 + }, + { + "epoch": 1.8049390434510784, + "grad_norm": 0.294921875, + "learning_rate": 0.00011451884720371704, + "loss": 1.4825, + "step": 5810 + }, + { + "epoch": 1.8052516411378556, + "grad_norm": 0.291015625, + "learning_rate": 0.00011449453654835996, + "loss": 1.9999, + "step": 5811 + }, + { + "epoch": 1.8055642388246327, + "grad_norm": 0.26953125, + "learning_rate": 0.00011447022501794924, + "loss": 1.5462, + "step": 5812 + }, + { + "epoch": 1.80587683651141, + "grad_norm": 0.298828125, + "learning_rate": 0.00011444591261395254, + "loss": 1.3408, + "step": 5813 + }, + { + "epoch": 1.806189434198187, + "grad_norm": 0.28515625, + "learning_rate": 0.00011442159933783776, + "loss": 1.5263, + "step": 5814 + }, + { + "epoch": 1.806502031884964, + "grad_norm": 0.287109375, + "learning_rate": 0.00011439728519107262, + "loss": 1.3573, + "step": 5815 + }, + { + "epoch": 1.806814629571741, + "grad_norm": 0.279296875, + "learning_rate": 0.000114372970175125, + "loss": 1.8411, + "step": 5816 + }, + { + "epoch": 1.8071272272585182, + "grad_norm": 0.298828125, + "learning_rate": 0.00011434865429146291, + "loss": 1.6114, + "step": 5817 + }, + { + "epoch": 1.8074398249452954, + "grad_norm": 0.294921875, + "learning_rate": 0.00011432433754155425, + "loss": 1.5172, + "step": 5818 + }, + { + "epoch": 1.8077524226320725, + "grad_norm": 0.275390625, + "learning_rate": 0.00011430001992686705, + "loss": 1.4666, + "step": 5819 + }, + { + "epoch": 1.8080650203188497, + "grad_norm": 0.29296875, + "learning_rate": 0.00011427570144886946, + "loss": 1.7839, + "step": 5820 + }, + { + "epoch": 1.8083776180056268, + "grad_norm": 0.294921875, + "learning_rate": 0.00011425138210902957, + "loss": 1.5083, + "step": 5821 + }, + { + "epoch": 1.8086902156924038, + "grad_norm": 0.291015625, + "learning_rate": 0.00011422706190881557, + "loss": 1.3222, + "step": 5822 + }, + { + "epoch": 1.809002813379181, + "grad_norm": 0.275390625, + "learning_rate": 0.00011420274084969573, + "loss": 1.3257, + "step": 5823 + }, + { + "epoch": 1.809315411065958, + "grad_norm": 0.28125, + "learning_rate": 0.00011417841893313835, + "loss": 1.6301, + "step": 5824 + }, + { + "epoch": 1.8096280087527352, + "grad_norm": 0.294921875, + "learning_rate": 0.0001141540961606117, + "loss": 1.3007, + "step": 5825 + }, + { + "epoch": 1.8099406064395125, + "grad_norm": 0.279296875, + "learning_rate": 0.00011412977253358425, + "loss": 1.9514, + "step": 5826 + }, + { + "epoch": 
1.8102532041262895, + "grad_norm": 0.2890625, + "learning_rate": 0.00011410544805352444, + "loss": 1.4303, + "step": 5827 + }, + { + "epoch": 1.8105658018130666, + "grad_norm": 0.2890625, + "learning_rate": 0.00011408112272190072, + "loss": 1.8044, + "step": 5828 + }, + { + "epoch": 1.8108783994998436, + "grad_norm": 0.296875, + "learning_rate": 0.00011405679654018171, + "loss": 1.5914, + "step": 5829 + }, + { + "epoch": 1.8111909971866207, + "grad_norm": 0.294921875, + "learning_rate": 0.000114032469509836, + "loss": 1.6985, + "step": 5830 + }, + { + "epoch": 1.811503594873398, + "grad_norm": 0.3046875, + "learning_rate": 0.00011400814163233217, + "loss": 1.4168, + "step": 5831 + }, + { + "epoch": 1.811816192560175, + "grad_norm": 0.296875, + "learning_rate": 0.00011398381290913902, + "loss": 1.5645, + "step": 5832 + }, + { + "epoch": 1.8121287902469523, + "grad_norm": 0.275390625, + "learning_rate": 0.00011395948334172524, + "loss": 1.7897, + "step": 5833 + }, + { + "epoch": 1.8124413879337293, + "grad_norm": 0.30859375, + "learning_rate": 0.00011393515293155967, + "loss": 1.7071, + "step": 5834 + }, + { + "epoch": 1.8127539856205064, + "grad_norm": 0.2890625, + "learning_rate": 0.00011391082168011116, + "loss": 1.659, + "step": 5835 + }, + { + "epoch": 1.8130665833072834, + "grad_norm": 0.279296875, + "learning_rate": 0.00011388648958884862, + "loss": 1.4107, + "step": 5836 + }, + { + "epoch": 1.8133791809940605, + "grad_norm": 0.291015625, + "learning_rate": 0.000113862156659241, + "loss": 1.3115, + "step": 5837 + }, + { + "epoch": 1.8136917786808378, + "grad_norm": 0.291015625, + "learning_rate": 0.00011383782289275733, + "loss": 1.3619, + "step": 5838 + }, + { + "epoch": 1.814004376367615, + "grad_norm": 0.357421875, + "learning_rate": 0.00011381348829086665, + "loss": 2.0378, + "step": 5839 + }, + { + "epoch": 1.814316974054392, + "grad_norm": 0.291015625, + "learning_rate": 0.00011378915285503808, + "loss": 1.4385, + "step": 5840 + }, + { + "epoch": 1.8146295717411691, + "grad_norm": 0.294921875, + "learning_rate": 0.00011376481658674079, + "loss": 1.6725, + "step": 5841 + }, + { + "epoch": 1.8149421694279462, + "grad_norm": 0.2734375, + "learning_rate": 0.00011374047948744398, + "loss": 1.6127, + "step": 5842 + }, + { + "epoch": 1.8152547671147232, + "grad_norm": 0.296875, + "learning_rate": 0.0001137161415586169, + "loss": 1.7975, + "step": 5843 + }, + { + "epoch": 1.8155673648015005, + "grad_norm": 0.28125, + "learning_rate": 0.00011369180280172888, + "loss": 1.6108, + "step": 5844 + }, + { + "epoch": 1.8158799624882775, + "grad_norm": 0.27734375, + "learning_rate": 0.00011366746321824927, + "loss": 1.5536, + "step": 5845 + }, + { + "epoch": 1.8161925601750548, + "grad_norm": 0.296875, + "learning_rate": 0.00011364312280964751, + "loss": 1.4006, + "step": 5846 + }, + { + "epoch": 1.8165051578618319, + "grad_norm": 0.294921875, + "learning_rate": 0.00011361878157739302, + "loss": 1.3971, + "step": 5847 + }, + { + "epoch": 1.816817755548609, + "grad_norm": 0.279296875, + "learning_rate": 0.00011359443952295537, + "loss": 1.6335, + "step": 5848 + }, + { + "epoch": 1.817130353235386, + "grad_norm": 0.296875, + "learning_rate": 0.00011357009664780404, + "loss": 1.5704, + "step": 5849 + }, + { + "epoch": 1.817442950922163, + "grad_norm": 0.296875, + "learning_rate": 0.0001135457529534087, + "loss": 1.7051, + "step": 5850 + }, + { + "epoch": 1.8177555486089403, + "grad_norm": 0.2890625, + "learning_rate": 0.00011352140844123897, + "loss": 1.7543, + "step": 5851 + }, + { + "epoch": 
1.8180681462957176, + "grad_norm": 0.2890625, + "learning_rate": 0.00011349706311276462, + "loss": 1.7139, + "step": 5852 + }, + { + "epoch": 1.8183807439824946, + "grad_norm": 0.296875, + "learning_rate": 0.00011347271696945536, + "loss": 1.5561, + "step": 5853 + }, + { + "epoch": 1.8186933416692717, + "grad_norm": 0.2734375, + "learning_rate": 0.00011344837001278101, + "loss": 1.7376, + "step": 5854 + }, + { + "epoch": 1.8190059393560487, + "grad_norm": 0.291015625, + "learning_rate": 0.0001134240222442114, + "loss": 1.4673, + "step": 5855 + }, + { + "epoch": 1.8193185370428258, + "grad_norm": 0.2890625, + "learning_rate": 0.00011339967366521648, + "loss": 1.9459, + "step": 5856 + }, + { + "epoch": 1.819631134729603, + "grad_norm": 0.28515625, + "learning_rate": 0.00011337532427726616, + "loss": 1.5291, + "step": 5857 + }, + { + "epoch": 1.81994373241638, + "grad_norm": 0.291015625, + "learning_rate": 0.00011335097408183051, + "loss": 1.5755, + "step": 5858 + }, + { + "epoch": 1.8202563301031573, + "grad_norm": 0.28515625, + "learning_rate": 0.0001133266230803795, + "loss": 1.6344, + "step": 5859 + }, + { + "epoch": 1.8205689277899344, + "grad_norm": 0.28125, + "learning_rate": 0.00011330227127438333, + "loss": 1.5934, + "step": 5860 + }, + { + "epoch": 1.8208815254767114, + "grad_norm": 0.265625, + "learning_rate": 0.00011327791866531203, + "loss": 1.6294, + "step": 5861 + }, + { + "epoch": 1.8211941231634885, + "grad_norm": 0.28125, + "learning_rate": 0.00011325356525463591, + "loss": 1.8007, + "step": 5862 + }, + { + "epoch": 1.8215067208502655, + "grad_norm": 0.28515625, + "learning_rate": 0.00011322921104382511, + "loss": 1.33, + "step": 5863 + }, + { + "epoch": 1.8218193185370428, + "grad_norm": 0.291015625, + "learning_rate": 0.00011320485603435002, + "loss": 1.7424, + "step": 5864 + }, + { + "epoch": 1.82213191622382, + "grad_norm": 0.283203125, + "learning_rate": 0.00011318050022768096, + "loss": 1.6708, + "step": 5865 + }, + { + "epoch": 1.8224445139105971, + "grad_norm": 0.28515625, + "learning_rate": 0.00011315614362528828, + "loss": 1.6851, + "step": 5866 + }, + { + "epoch": 1.8227571115973742, + "grad_norm": 0.28125, + "learning_rate": 0.00011313178622864245, + "loss": 1.4527, + "step": 5867 + }, + { + "epoch": 1.8230697092841512, + "grad_norm": 0.27734375, + "learning_rate": 0.00011310742803921395, + "loss": 1.3163, + "step": 5868 + }, + { + "epoch": 1.8233823069709283, + "grad_norm": 0.291015625, + "learning_rate": 0.0001130830690584733, + "loss": 1.9049, + "step": 5869 + }, + { + "epoch": 1.8236949046577056, + "grad_norm": 0.28125, + "learning_rate": 0.0001130587092878911, + "loss": 1.4765, + "step": 5870 + }, + { + "epoch": 1.8240075023444826, + "grad_norm": 0.296875, + "learning_rate": 0.00011303434872893801, + "loss": 1.6214, + "step": 5871 + }, + { + "epoch": 1.8243201000312599, + "grad_norm": 0.291015625, + "learning_rate": 0.00011300998738308468, + "loss": 1.6034, + "step": 5872 + }, + { + "epoch": 1.824632697718037, + "grad_norm": 0.279296875, + "learning_rate": 0.00011298562525180178, + "loss": 1.298, + "step": 5873 + }, + { + "epoch": 1.824945295404814, + "grad_norm": 0.291015625, + "learning_rate": 0.0001129612623365602, + "loss": 1.6643, + "step": 5874 + }, + { + "epoch": 1.825257893091591, + "grad_norm": 0.2890625, + "learning_rate": 0.00011293689863883063, + "loss": 1.494, + "step": 5875 + }, + { + "epoch": 1.825570490778368, + "grad_norm": 0.306640625, + "learning_rate": 0.00011291253416008405, + "loss": 1.6174, + "step": 5876 + }, + { + "epoch": 
1.8258830884651454, + "grad_norm": 0.26953125, + "learning_rate": 0.00011288816890179132, + "loss": 1.6457, + "step": 5877 + }, + { + "epoch": 1.8261956861519226, + "grad_norm": 0.28125, + "learning_rate": 0.00011286380286542342, + "loss": 1.4945, + "step": 5878 + }, + { + "epoch": 1.8265082838386997, + "grad_norm": 0.279296875, + "learning_rate": 0.00011283943605245133, + "loss": 1.5943, + "step": 5879 + }, + { + "epoch": 1.8268208815254767, + "grad_norm": 0.3046875, + "learning_rate": 0.00011281506846434613, + "loss": 1.4638, + "step": 5880 + }, + { + "epoch": 1.8271334792122538, + "grad_norm": 0.298828125, + "learning_rate": 0.0001127907001025789, + "loss": 1.7105, + "step": 5881 + }, + { + "epoch": 1.8274460768990308, + "grad_norm": 0.291015625, + "learning_rate": 0.00011276633096862083, + "loss": 1.4812, + "step": 5882 + }, + { + "epoch": 1.827758674585808, + "grad_norm": 0.271484375, + "learning_rate": 0.00011274196106394308, + "loss": 1.2702, + "step": 5883 + }, + { + "epoch": 1.8280712722725851, + "grad_norm": 0.291015625, + "learning_rate": 0.00011271759039001693, + "loss": 1.578, + "step": 5884 + }, + { + "epoch": 1.8283838699593624, + "grad_norm": 0.28125, + "learning_rate": 0.0001126932189483136, + "loss": 1.7245, + "step": 5885 + }, + { + "epoch": 1.8286964676461395, + "grad_norm": 0.28515625, + "learning_rate": 0.00011266884674030449, + "loss": 1.7638, + "step": 5886 + }, + { + "epoch": 1.8290090653329165, + "grad_norm": 0.30078125, + "learning_rate": 0.00011264447376746095, + "loss": 1.7444, + "step": 5887 + }, + { + "epoch": 1.8293216630196936, + "grad_norm": 0.294921875, + "learning_rate": 0.00011262010003125443, + "loss": 1.4711, + "step": 5888 + }, + { + "epoch": 1.8296342607064706, + "grad_norm": 0.283203125, + "learning_rate": 0.00011259572553315636, + "loss": 1.6199, + "step": 5889 + }, + { + "epoch": 1.8299468583932479, + "grad_norm": 0.298828125, + "learning_rate": 0.00011257135027463832, + "loss": 1.4648, + "step": 5890 + }, + { + "epoch": 1.8302594560800252, + "grad_norm": 0.296875, + "learning_rate": 0.00011254697425717182, + "loss": 1.5086, + "step": 5891 + }, + { + "epoch": 1.8305720537668022, + "grad_norm": 0.287109375, + "learning_rate": 0.0001125225974822285, + "loss": 1.6374, + "step": 5892 + }, + { + "epoch": 1.8308846514535793, + "grad_norm": 0.283203125, + "learning_rate": 0.00011249821995128003, + "loss": 1.5425, + "step": 5893 + }, + { + "epoch": 1.8311972491403563, + "grad_norm": 0.2734375, + "learning_rate": 0.00011247384166579808, + "loss": 1.3473, + "step": 5894 + }, + { + "epoch": 1.8315098468271334, + "grad_norm": 0.275390625, + "learning_rate": 0.00011244946262725442, + "loss": 1.4025, + "step": 5895 + }, + { + "epoch": 1.8318224445139106, + "grad_norm": 0.28515625, + "learning_rate": 0.00011242508283712084, + "loss": 1.6087, + "step": 5896 + }, + { + "epoch": 1.8321350422006877, + "grad_norm": 0.30078125, + "learning_rate": 0.00011240070229686916, + "loss": 1.544, + "step": 5897 + }, + { + "epoch": 1.832447639887465, + "grad_norm": 0.279296875, + "learning_rate": 0.00011237632100797129, + "loss": 1.6119, + "step": 5898 + }, + { + "epoch": 1.832760237574242, + "grad_norm": 0.30078125, + "learning_rate": 0.00011235193897189913, + "loss": 1.4214, + "step": 5899 + }, + { + "epoch": 1.833072835261019, + "grad_norm": 0.283203125, + "learning_rate": 0.00011232755619012469, + "loss": 1.4287, + "step": 5900 + }, + { + "epoch": 1.833385432947796, + "grad_norm": 0.283203125, + "learning_rate": 0.00011230317266412, + "loss": 1.2955, + "step": 5901 + }, + { 
+ "epoch": 1.8336980306345732, + "grad_norm": 0.279296875, + "learning_rate": 0.00011227878839535708, + "loss": 1.7294, + "step": 5902 + }, + { + "epoch": 1.8340106283213504, + "grad_norm": 0.283203125, + "learning_rate": 0.00011225440338530804, + "loss": 1.3389, + "step": 5903 + }, + { + "epoch": 1.8343232260081277, + "grad_norm": 0.291015625, + "learning_rate": 0.00011223001763544507, + "loss": 1.572, + "step": 5904 + }, + { + "epoch": 1.8346358236949047, + "grad_norm": 0.275390625, + "learning_rate": 0.00011220563114724032, + "loss": 1.491, + "step": 5905 + }, + { + "epoch": 1.8349484213816818, + "grad_norm": 0.283203125, + "learning_rate": 0.00011218124392216609, + "loss": 1.7087, + "step": 5906 + }, + { + "epoch": 1.8352610190684588, + "grad_norm": 0.294921875, + "learning_rate": 0.00011215685596169465, + "loss": 1.8135, + "step": 5907 + }, + { + "epoch": 1.835573616755236, + "grad_norm": 0.2734375, + "learning_rate": 0.00011213246726729832, + "loss": 1.5235, + "step": 5908 + }, + { + "epoch": 1.8358862144420132, + "grad_norm": 0.28125, + "learning_rate": 0.00011210807784044945, + "loss": 1.5631, + "step": 5909 + }, + { + "epoch": 1.8361988121287902, + "grad_norm": 0.28515625, + "learning_rate": 0.00011208368768262054, + "loss": 1.6162, + "step": 5910 + }, + { + "epoch": 1.8365114098155675, + "grad_norm": 0.294921875, + "learning_rate": 0.00011205929679528395, + "loss": 1.5487, + "step": 5911 + }, + { + "epoch": 1.8368240075023445, + "grad_norm": 0.28515625, + "learning_rate": 0.00011203490517991231, + "loss": 1.7964, + "step": 5912 + }, + { + "epoch": 1.8371366051891216, + "grad_norm": 0.291015625, + "learning_rate": 0.00011201051283797808, + "loss": 1.8237, + "step": 5913 + }, + { + "epoch": 1.8374492028758986, + "grad_norm": 0.3046875, + "learning_rate": 0.00011198611977095388, + "loss": 1.7002, + "step": 5914 + }, + { + "epoch": 1.8377618005626757, + "grad_norm": 0.2890625, + "learning_rate": 0.00011196172598031233, + "loss": 1.6531, + "step": 5915 + }, + { + "epoch": 1.838074398249453, + "grad_norm": 0.28515625, + "learning_rate": 0.00011193733146752618, + "loss": 1.4178, + "step": 5916 + }, + { + "epoch": 1.8383869959362302, + "grad_norm": 0.26171875, + "learning_rate": 0.00011191293623406809, + "loss": 1.6298, + "step": 5917 + }, + { + "epoch": 1.8386995936230073, + "grad_norm": 0.30078125, + "learning_rate": 0.00011188854028141089, + "loss": 1.5161, + "step": 5918 + }, + { + "epoch": 1.8390121913097843, + "grad_norm": 0.287109375, + "learning_rate": 0.00011186414361102732, + "loss": 1.4156, + "step": 5919 + }, + { + "epoch": 1.8393247889965614, + "grad_norm": 0.294921875, + "learning_rate": 0.00011183974622439033, + "loss": 1.4677, + "step": 5920 + }, + { + "epoch": 1.8396373866833384, + "grad_norm": 0.294921875, + "learning_rate": 0.00011181534812297272, + "loss": 1.4706, + "step": 5921 + }, + { + "epoch": 1.8399499843701157, + "grad_norm": 0.283203125, + "learning_rate": 0.00011179094930824754, + "loss": 1.5606, + "step": 5922 + }, + { + "epoch": 1.8402625820568927, + "grad_norm": 0.27734375, + "learning_rate": 0.00011176654978168768, + "loss": 1.4768, + "step": 5923 + }, + { + "epoch": 1.84057517974367, + "grad_norm": 0.29296875, + "learning_rate": 0.00011174214954476625, + "loss": 1.597, + "step": 5924 + }, + { + "epoch": 1.840887777430447, + "grad_norm": 0.291015625, + "learning_rate": 0.00011171774859895628, + "loss": 1.5222, + "step": 5925 + }, + { + "epoch": 1.8412003751172241, + "grad_norm": 0.271484375, + "learning_rate": 0.00011169334694573088, + "loss": 1.6, + 
"step": 5926 + }, + { + "epoch": 1.8415129728040012, + "grad_norm": 0.29296875, + "learning_rate": 0.00011166894458656323, + "loss": 1.6754, + "step": 5927 + }, + { + "epoch": 1.8418255704907782, + "grad_norm": 0.29296875, + "learning_rate": 0.00011164454152292653, + "loss": 1.5128, + "step": 5928 + }, + { + "epoch": 1.8421381681775555, + "grad_norm": 0.29296875, + "learning_rate": 0.00011162013775629401, + "loss": 1.4511, + "step": 5929 + }, + { + "epoch": 1.8424507658643328, + "grad_norm": 0.298828125, + "learning_rate": 0.000111595733288139, + "loss": 1.6394, + "step": 5930 + }, + { + "epoch": 1.8427633635511098, + "grad_norm": 0.283203125, + "learning_rate": 0.00011157132811993475, + "loss": 1.6909, + "step": 5931 + }, + { + "epoch": 1.8430759612378869, + "grad_norm": 0.279296875, + "learning_rate": 0.00011154692225315471, + "loss": 1.4136, + "step": 5932 + }, + { + "epoch": 1.843388558924664, + "grad_norm": 0.28125, + "learning_rate": 0.00011152251568927223, + "loss": 1.742, + "step": 5933 + }, + { + "epoch": 1.843701156611441, + "grad_norm": 0.306640625, + "learning_rate": 0.00011149810842976081, + "loss": 1.4884, + "step": 5934 + }, + { + "epoch": 1.8440137542982182, + "grad_norm": 0.294921875, + "learning_rate": 0.00011147370047609391, + "loss": 1.5825, + "step": 5935 + }, + { + "epoch": 1.8443263519849953, + "grad_norm": 0.271484375, + "learning_rate": 0.00011144929182974515, + "loss": 1.4863, + "step": 5936 + }, + { + "epoch": 1.8446389496717726, + "grad_norm": 0.29296875, + "learning_rate": 0.000111424882492188, + "loss": 1.4343, + "step": 5937 + }, + { + "epoch": 1.8449515473585496, + "grad_norm": 0.279296875, + "learning_rate": 0.00011140047246489616, + "loss": 1.487, + "step": 5938 + }, + { + "epoch": 1.8452641450453267, + "grad_norm": 0.30078125, + "learning_rate": 0.00011137606174934323, + "loss": 1.6822, + "step": 5939 + }, + { + "epoch": 1.8455767427321037, + "grad_norm": 0.28515625, + "learning_rate": 0.00011135165034700299, + "loss": 1.6089, + "step": 5940 + }, + { + "epoch": 1.8458893404188808, + "grad_norm": 0.27734375, + "learning_rate": 0.00011132723825934914, + "loss": 1.6872, + "step": 5941 + }, + { + "epoch": 1.846201938105658, + "grad_norm": 0.29296875, + "learning_rate": 0.0001113028254878555, + "loss": 1.5907, + "step": 5942 + }, + { + "epoch": 1.8465145357924353, + "grad_norm": 0.298828125, + "learning_rate": 0.00011127841203399587, + "loss": 1.5709, + "step": 5943 + }, + { + "epoch": 1.8468271334792123, + "grad_norm": 0.30078125, + "learning_rate": 0.00011125399789924415, + "loss": 1.5323, + "step": 5944 + }, + { + "epoch": 1.8471397311659894, + "grad_norm": 0.27734375, + "learning_rate": 0.0001112295830850742, + "loss": 1.4699, + "step": 5945 + }, + { + "epoch": 1.8474523288527664, + "grad_norm": 0.2890625, + "learning_rate": 0.00011120516759296005, + "loss": 1.8765, + "step": 5946 + }, + { + "epoch": 1.8477649265395435, + "grad_norm": 0.279296875, + "learning_rate": 0.0001111807514243756, + "loss": 1.3026, + "step": 5947 + }, + { + "epoch": 1.8480775242263208, + "grad_norm": 0.287109375, + "learning_rate": 0.00011115633458079501, + "loss": 1.4757, + "step": 5948 + }, + { + "epoch": 1.8483901219130978, + "grad_norm": 0.279296875, + "learning_rate": 0.00011113191706369224, + "loss": 1.5611, + "step": 5949 + }, + { + "epoch": 1.848702719599875, + "grad_norm": 0.318359375, + "learning_rate": 0.00011110749887454145, + "loss": 2.0363, + "step": 5950 + }, + { + "epoch": 1.8490153172866521, + "grad_norm": 0.296875, + "learning_rate": 0.0001110830800148168, + 
"loss": 1.5084, + "step": 5951 + }, + { + "epoch": 1.8493279149734292, + "grad_norm": 0.2890625, + "learning_rate": 0.0001110586604859925, + "loss": 1.335, + "step": 5952 + }, + { + "epoch": 1.8496405126602062, + "grad_norm": 0.28515625, + "learning_rate": 0.00011103424028954275, + "loss": 1.5252, + "step": 5953 + }, + { + "epoch": 1.8499531103469833, + "grad_norm": 0.294921875, + "learning_rate": 0.0001110098194269419, + "loss": 1.6241, + "step": 5954 + }, + { + "epoch": 1.8502657080337606, + "grad_norm": 0.287109375, + "learning_rate": 0.00011098539789966418, + "loss": 1.674, + "step": 5955 + }, + { + "epoch": 1.8505783057205378, + "grad_norm": 0.283203125, + "learning_rate": 0.000110960975709184, + "loss": 1.265, + "step": 5956 + }, + { + "epoch": 1.8508909034073149, + "grad_norm": 0.271484375, + "learning_rate": 0.00011093655285697573, + "loss": 1.4216, + "step": 5957 + }, + { + "epoch": 1.851203501094092, + "grad_norm": 0.294921875, + "learning_rate": 0.00011091212934451387, + "loss": 1.6109, + "step": 5958 + }, + { + "epoch": 1.851516098780869, + "grad_norm": 0.283203125, + "learning_rate": 0.00011088770517327283, + "loss": 1.5298, + "step": 5959 + }, + { + "epoch": 1.851828696467646, + "grad_norm": 0.259765625, + "learning_rate": 0.00011086328034472717, + "loss": 1.6639, + "step": 5960 + }, + { + "epoch": 1.8521412941544233, + "grad_norm": 0.294921875, + "learning_rate": 0.0001108388548603514, + "loss": 1.5286, + "step": 5961 + }, + { + "epoch": 1.8524538918412004, + "grad_norm": 0.29296875, + "learning_rate": 0.00011081442872162018, + "loss": 1.6964, + "step": 5962 + }, + { + "epoch": 1.8527664895279776, + "grad_norm": 0.26953125, + "learning_rate": 0.00011079000193000808, + "loss": 1.3428, + "step": 5963 + }, + { + "epoch": 1.8530790872147547, + "grad_norm": 0.287109375, + "learning_rate": 0.00011076557448698985, + "loss": 1.4291, + "step": 5964 + }, + { + "epoch": 1.8533916849015317, + "grad_norm": 0.279296875, + "learning_rate": 0.00011074114639404016, + "loss": 1.6868, + "step": 5965 + }, + { + "epoch": 1.8537042825883088, + "grad_norm": 0.279296875, + "learning_rate": 0.0001107167176526338, + "loss": 1.5273, + "step": 5966 + }, + { + "epoch": 1.8540168802750858, + "grad_norm": 0.271484375, + "learning_rate": 0.00011069228826424548, + "loss": 1.2799, + "step": 5967 + }, + { + "epoch": 1.854329477961863, + "grad_norm": 0.27734375, + "learning_rate": 0.00011066785823035015, + "loss": 1.8009, + "step": 5968 + }, + { + "epoch": 1.8546420756486404, + "grad_norm": 0.35546875, + "learning_rate": 0.00011064342755242259, + "loss": 2.1242, + "step": 5969 + }, + { + "epoch": 1.8549546733354174, + "grad_norm": 0.27734375, + "learning_rate": 0.00011061899623193774, + "loss": 1.8559, + "step": 5970 + }, + { + "epoch": 1.8552672710221945, + "grad_norm": 0.2890625, + "learning_rate": 0.00011059456427037059, + "loss": 1.5747, + "step": 5971 + }, + { + "epoch": 1.8555798687089715, + "grad_norm": 0.3203125, + "learning_rate": 0.00011057013166919607, + "loss": 1.7598, + "step": 5972 + }, + { + "epoch": 1.8558924663957486, + "grad_norm": 0.28515625, + "learning_rate": 0.00011054569842988925, + "loss": 1.7396, + "step": 5973 + }, + { + "epoch": 1.8562050640825258, + "grad_norm": 0.28125, + "learning_rate": 0.00011052126455392515, + "loss": 1.4872, + "step": 5974 + }, + { + "epoch": 1.8565176617693029, + "grad_norm": 0.287109375, + "learning_rate": 0.00011049683004277892, + "loss": 1.3617, + "step": 5975 + }, + { + "epoch": 1.8568302594560802, + "grad_norm": 0.28125, + "learning_rate": 
0.00011047239489792567, + "loss": 1.3131, + "step": 5976 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.296875, + "learning_rate": 0.00011044795912084063, + "loss": 1.7335, + "step": 5977 + }, + { + "epoch": 1.8574554548296343, + "grad_norm": 0.314453125, + "learning_rate": 0.00011042352271299897, + "loss": 1.5964, + "step": 5978 + }, + { + "epoch": 1.8577680525164113, + "grad_norm": 0.291015625, + "learning_rate": 0.00011039908567587591, + "loss": 1.5923, + "step": 5979 + }, + { + "epoch": 1.8580806502031884, + "grad_norm": 0.2890625, + "learning_rate": 0.00011037464801094686, + "loss": 1.4119, + "step": 5980 + }, + { + "epoch": 1.8583932478899656, + "grad_norm": 0.28515625, + "learning_rate": 0.00011035020971968704, + "loss": 1.6322, + "step": 5981 + }, + { + "epoch": 1.8587058455767427, + "grad_norm": 0.287109375, + "learning_rate": 0.00011032577080357189, + "loss": 1.7092, + "step": 5982 + }, + { + "epoch": 1.85901844326352, + "grad_norm": 0.30078125, + "learning_rate": 0.00011030133126407682, + "loss": 1.6851, + "step": 5983 + }, + { + "epoch": 1.859331040950297, + "grad_norm": 0.37109375, + "learning_rate": 0.00011027689110267724, + "loss": 2.1295, + "step": 5984 + }, + { + "epoch": 1.859643638637074, + "grad_norm": 0.30078125, + "learning_rate": 0.00011025245032084863, + "loss": 1.6702, + "step": 5985 + }, + { + "epoch": 1.859956236323851, + "grad_norm": 0.29296875, + "learning_rate": 0.00011022800892006655, + "loss": 1.544, + "step": 5986 + }, + { + "epoch": 1.8602688340106284, + "grad_norm": 0.2890625, + "learning_rate": 0.00011020356690180653, + "loss": 1.38, + "step": 5987 + }, + { + "epoch": 1.8605814316974054, + "grad_norm": 0.2734375, + "learning_rate": 0.00011017912426754417, + "loss": 1.6264, + "step": 5988 + }, + { + "epoch": 1.8608940293841827, + "grad_norm": 0.28515625, + "learning_rate": 0.00011015468101875512, + "loss": 1.4654, + "step": 5989 + }, + { + "epoch": 1.8612066270709597, + "grad_norm": 0.29296875, + "learning_rate": 0.00011013023715691504, + "loss": 1.5018, + "step": 5990 + }, + { + "epoch": 1.8615192247577368, + "grad_norm": 0.296875, + "learning_rate": 0.00011010579268349961, + "loss": 1.4131, + "step": 5991 + }, + { + "epoch": 1.8618318224445138, + "grad_norm": 0.2734375, + "learning_rate": 0.00011008134759998464, + "loss": 1.6969, + "step": 5992 + }, + { + "epoch": 1.862144420131291, + "grad_norm": 0.2890625, + "learning_rate": 0.00011005690190784582, + "loss": 1.7404, + "step": 5993 + }, + { + "epoch": 1.8624570178180682, + "grad_norm": 0.296875, + "learning_rate": 0.00011003245560855901, + "loss": 1.2808, + "step": 5994 + }, + { + "epoch": 1.8627696155048452, + "grad_norm": 0.29296875, + "learning_rate": 0.00011000800870360013, + "loss": 1.7481, + "step": 5995 + }, + { + "epoch": 1.8630822131916225, + "grad_norm": 0.30078125, + "learning_rate": 0.00010998356119444497, + "loss": 1.3703, + "step": 5996 + }, + { + "epoch": 1.8633948108783995, + "grad_norm": 0.283203125, + "learning_rate": 0.0001099591130825695, + "loss": 1.651, + "step": 5997 + }, + { + "epoch": 1.8637074085651766, + "grad_norm": 0.2890625, + "learning_rate": 0.00010993466436944968, + "loss": 1.6761, + "step": 5998 + }, + { + "epoch": 1.8640200062519536, + "grad_norm": 0.296875, + "learning_rate": 0.00010991021505656152, + "loss": 1.4074, + "step": 5999 + }, + { + "epoch": 1.864332603938731, + "grad_norm": 0.28515625, + "learning_rate": 0.00010988576514538105, + "loss": 1.3369, + "step": 6000 + }, + { + "epoch": 1.864645201625508, + "grad_norm": 0.28125, + "learning_rate": 
0.00010986131463738434, + "loss": 1.3919, + "step": 6001 + }, + { + "epoch": 1.8649577993122852, + "grad_norm": 0.27734375, + "learning_rate": 0.0001098368635340475, + "loss": 1.6825, + "step": 6002 + }, + { + "epoch": 1.8652703969990623, + "grad_norm": 0.2890625, + "learning_rate": 0.00010981241183684667, + "loss": 1.5925, + "step": 6003 + }, + { + "epoch": 1.8655829946858393, + "grad_norm": 0.27734375, + "learning_rate": 0.00010978795954725804, + "loss": 1.5749, + "step": 6004 + }, + { + "epoch": 1.8658955923726164, + "grad_norm": 0.28515625, + "learning_rate": 0.00010976350666675781, + "loss": 1.7939, + "step": 6005 + }, + { + "epoch": 1.8662081900593934, + "grad_norm": 0.287109375, + "learning_rate": 0.00010973905319682223, + "loss": 1.806, + "step": 6006 + }, + { + "epoch": 1.8665207877461707, + "grad_norm": 0.279296875, + "learning_rate": 0.00010971459913892763, + "loss": 1.5721, + "step": 6007 + }, + { + "epoch": 1.8668333854329477, + "grad_norm": 0.283203125, + "learning_rate": 0.00010969014449455026, + "loss": 1.5858, + "step": 6008 + }, + { + "epoch": 1.867145983119725, + "grad_norm": 0.3046875, + "learning_rate": 0.00010966568926516656, + "loss": 1.7618, + "step": 6009 + }, + { + "epoch": 1.867458580806502, + "grad_norm": 0.28125, + "learning_rate": 0.00010964123345225285, + "loss": 1.4188, + "step": 6010 + }, + { + "epoch": 1.8677711784932791, + "grad_norm": 0.28125, + "learning_rate": 0.00010961677705728561, + "loss": 1.5006, + "step": 6011 + }, + { + "epoch": 1.8680837761800562, + "grad_norm": 0.294921875, + "learning_rate": 0.00010959232008174127, + "loss": 1.554, + "step": 6012 + }, + { + "epoch": 1.8683963738668334, + "grad_norm": 0.291015625, + "learning_rate": 0.00010956786252709637, + "loss": 1.3672, + "step": 6013 + }, + { + "epoch": 1.8687089715536105, + "grad_norm": 0.271484375, + "learning_rate": 0.00010954340439482738, + "loss": 1.2607, + "step": 6014 + }, + { + "epoch": 1.8690215692403878, + "grad_norm": 0.28515625, + "learning_rate": 0.00010951894568641093, + "loss": 1.4708, + "step": 6015 + }, + { + "epoch": 1.8693341669271648, + "grad_norm": 0.2890625, + "learning_rate": 0.0001094944864033236, + "loss": 1.6332, + "step": 6016 + }, + { + "epoch": 1.8696467646139419, + "grad_norm": 0.283203125, + "learning_rate": 0.00010947002654704201, + "loss": 1.5205, + "step": 6017 + }, + { + "epoch": 1.869959362300719, + "grad_norm": 0.296875, + "learning_rate": 0.00010944556611904286, + "loss": 1.5209, + "step": 6018 + }, + { + "epoch": 1.870271959987496, + "grad_norm": 0.298828125, + "learning_rate": 0.00010942110512080286, + "loss": 1.4901, + "step": 6019 + }, + { + "epoch": 1.8705845576742732, + "grad_norm": 0.28125, + "learning_rate": 0.0001093966435537987, + "loss": 1.582, + "step": 6020 + }, + { + "epoch": 1.8708971553610503, + "grad_norm": 0.283203125, + "learning_rate": 0.00010937218141950722, + "loss": 1.868, + "step": 6021 + }, + { + "epoch": 1.8712097530478276, + "grad_norm": 0.3046875, + "learning_rate": 0.00010934771871940521, + "loss": 1.5889, + "step": 6022 + }, + { + "epoch": 1.8715223507346046, + "grad_norm": 0.28125, + "learning_rate": 0.00010932325545496948, + "loss": 1.6362, + "step": 6023 + }, + { + "epoch": 1.8718349484213817, + "grad_norm": 0.296875, + "learning_rate": 0.00010929879162767698, + "loss": 1.4245, + "step": 6024 + }, + { + "epoch": 1.8721475461081587, + "grad_norm": 0.28125, + "learning_rate": 0.00010927432723900457, + "loss": 1.3501, + "step": 6025 + }, + { + "epoch": 1.872460143794936, + "grad_norm": 0.283203125, + "learning_rate": 
0.00010924986229042917, + "loss": 1.6494, + "step": 6026 + }, + { + "epoch": 1.872772741481713, + "grad_norm": 0.287109375, + "learning_rate": 0.00010922539678342785, + "loss": 1.6763, + "step": 6027 + }, + { + "epoch": 1.8730853391684903, + "grad_norm": 0.271484375, + "learning_rate": 0.00010920093071947755, + "loss": 1.3311, + "step": 6028 + }, + { + "epoch": 1.8733979368552673, + "grad_norm": 0.283203125, + "learning_rate": 0.00010917646410005532, + "loss": 1.4565, + "step": 6029 + }, + { + "epoch": 1.8737105345420444, + "grad_norm": 0.28515625, + "learning_rate": 0.00010915199692663827, + "loss": 1.4469, + "step": 6030 + }, + { + "epoch": 1.8740231322288214, + "grad_norm": 0.265625, + "learning_rate": 0.00010912752920070351, + "loss": 1.3675, + "step": 6031 + }, + { + "epoch": 1.8743357299155985, + "grad_norm": 0.283203125, + "learning_rate": 0.00010910306092372815, + "loss": 1.7303, + "step": 6032 + }, + { + "epoch": 1.8746483276023758, + "grad_norm": 0.275390625, + "learning_rate": 0.00010907859209718942, + "loss": 1.3998, + "step": 6033 + }, + { + "epoch": 1.8749609252891528, + "grad_norm": 0.291015625, + "learning_rate": 0.00010905412272256452, + "loss": 1.4973, + "step": 6034 + }, + { + "epoch": 1.87527352297593, + "grad_norm": 0.279296875, + "learning_rate": 0.00010902965280133068, + "loss": 1.5443, + "step": 6035 + }, + { + "epoch": 1.8755861206627071, + "grad_norm": 0.28125, + "learning_rate": 0.00010900518233496522, + "loss": 1.582, + "step": 6036 + }, + { + "epoch": 1.8758987183494842, + "grad_norm": 0.29296875, + "learning_rate": 0.00010898071132494543, + "loss": 1.5631, + "step": 6037 + }, + { + "epoch": 1.8762113160362612, + "grad_norm": 0.2734375, + "learning_rate": 0.00010895623977274863, + "loss": 1.2673, + "step": 6038 + }, + { + "epoch": 1.8765239137230383, + "grad_norm": 0.294921875, + "learning_rate": 0.00010893176767985223, + "loss": 1.6438, + "step": 6039 + }, + { + "epoch": 1.8768365114098156, + "grad_norm": 0.283203125, + "learning_rate": 0.0001089072950477336, + "loss": 1.4649, + "step": 6040 + }, + { + "epoch": 1.8771491090965928, + "grad_norm": 0.287109375, + "learning_rate": 0.00010888282187787026, + "loss": 1.6903, + "step": 6041 + }, + { + "epoch": 1.8774617067833699, + "grad_norm": 0.28125, + "learning_rate": 0.00010885834817173966, + "loss": 1.4262, + "step": 6042 + }, + { + "epoch": 1.877774304470147, + "grad_norm": 0.279296875, + "learning_rate": 0.00010883387393081928, + "loss": 1.4242, + "step": 6043 + }, + { + "epoch": 1.878086902156924, + "grad_norm": 0.291015625, + "learning_rate": 0.00010880939915658663, + "loss": 1.3279, + "step": 6044 + }, + { + "epoch": 1.878399499843701, + "grad_norm": 0.3046875, + "learning_rate": 0.00010878492385051937, + "loss": 1.785, + "step": 6045 + }, + { + "epoch": 1.8787120975304783, + "grad_norm": 0.294921875, + "learning_rate": 0.00010876044801409502, + "loss": 1.7062, + "step": 6046 + }, + { + "epoch": 1.8790246952172553, + "grad_norm": 0.28515625, + "learning_rate": 0.0001087359716487913, + "loss": 1.3882, + "step": 6047 + }, + { + "epoch": 1.8793372929040326, + "grad_norm": 0.279296875, + "learning_rate": 0.00010871149475608584, + "loss": 1.3782, + "step": 6048 + }, + { + "epoch": 1.8796498905908097, + "grad_norm": 0.28125, + "learning_rate": 0.00010868701733745634, + "loss": 1.8456, + "step": 6049 + }, + { + "epoch": 1.8799624882775867, + "grad_norm": 0.298828125, + "learning_rate": 0.00010866253939438049, + "loss": 1.6852, + "step": 6050 + }, + { + "epoch": 1.8802750859643638, + "grad_norm": 0.298828125, + 
"learning_rate": 0.00010863806092833615, + "loss": 1.7426, + "step": 6051 + }, + { + "epoch": 1.8805876836511408, + "grad_norm": 0.287109375, + "learning_rate": 0.00010861358194080102, + "loss": 1.3551, + "step": 6052 + }, + { + "epoch": 1.880900281337918, + "grad_norm": 0.283203125, + "learning_rate": 0.00010858910243325301, + "loss": 1.8453, + "step": 6053 + }, + { + "epoch": 1.8812128790246954, + "grad_norm": 0.275390625, + "learning_rate": 0.00010856462240716994, + "loss": 1.5246, + "step": 6054 + }, + { + "epoch": 1.8815254767114724, + "grad_norm": 0.291015625, + "learning_rate": 0.0001085401418640297, + "loss": 1.7768, + "step": 6055 + }, + { + "epoch": 1.8818380743982495, + "grad_norm": 0.27734375, + "learning_rate": 0.00010851566080531017, + "loss": 1.4684, + "step": 6056 + }, + { + "epoch": 1.8821506720850265, + "grad_norm": 0.3203125, + "learning_rate": 0.0001084911792324894, + "loss": 1.6255, + "step": 6057 + }, + { + "epoch": 1.8824632697718036, + "grad_norm": 0.291015625, + "learning_rate": 0.00010846669714704527, + "loss": 1.5263, + "step": 6058 + }, + { + "epoch": 1.8827758674585808, + "grad_norm": 0.3046875, + "learning_rate": 0.00010844221455045591, + "loss": 1.6249, + "step": 6059 + }, + { + "epoch": 1.8830884651453579, + "grad_norm": 0.2890625, + "learning_rate": 0.00010841773144419924, + "loss": 1.6932, + "step": 6060 + }, + { + "epoch": 1.8834010628321352, + "grad_norm": 0.302734375, + "learning_rate": 0.00010839324782975341, + "loss": 1.5831, + "step": 6061 + }, + { + "epoch": 1.8837136605189122, + "grad_norm": 0.28515625, + "learning_rate": 0.00010836876370859652, + "loss": 1.747, + "step": 6062 + }, + { + "epoch": 1.8840262582056893, + "grad_norm": 0.271484375, + "learning_rate": 0.0001083442790822067, + "loss": 1.7763, + "step": 6063 + }, + { + "epoch": 1.8843388558924663, + "grad_norm": 0.28515625, + "learning_rate": 0.00010831979395206211, + "loss": 1.7771, + "step": 6064 + }, + { + "epoch": 1.8846514535792434, + "grad_norm": 0.27734375, + "learning_rate": 0.00010829530831964098, + "loss": 1.6032, + "step": 6065 + }, + { + "epoch": 1.8849640512660206, + "grad_norm": 0.30859375, + "learning_rate": 0.00010827082218642149, + "loss": 1.6424, + "step": 6066 + }, + { + "epoch": 1.885276648952798, + "grad_norm": 0.28125, + "learning_rate": 0.00010824633555388196, + "loss": 1.6392, + "step": 6067 + }, + { + "epoch": 1.885589246639575, + "grad_norm": 0.291015625, + "learning_rate": 0.00010822184842350058, + "loss": 1.2719, + "step": 6068 + }, + { + "epoch": 1.885901844326352, + "grad_norm": 0.287109375, + "learning_rate": 0.00010819736079675577, + "loss": 1.6951, + "step": 6069 + }, + { + "epoch": 1.886214442013129, + "grad_norm": 0.287109375, + "learning_rate": 0.00010817287267512584, + "loss": 1.5512, + "step": 6070 + }, + { + "epoch": 1.886527039699906, + "grad_norm": 0.28515625, + "learning_rate": 0.0001081483840600892, + "loss": 1.4913, + "step": 6071 + }, + { + "epoch": 1.8868396373866834, + "grad_norm": 0.279296875, + "learning_rate": 0.0001081238949531242, + "loss": 1.4639, + "step": 6072 + }, + { + "epoch": 1.8871522350734604, + "grad_norm": 0.306640625, + "learning_rate": 0.00010809940535570932, + "loss": 1.6983, + "step": 6073 + }, + { + "epoch": 1.8874648327602377, + "grad_norm": 0.294921875, + "learning_rate": 0.000108074915269323, + "loss": 1.7414, + "step": 6074 + }, + { + "epoch": 1.8877774304470147, + "grad_norm": 0.306640625, + "learning_rate": 0.0001080504246954438, + "loss": 1.6636, + "step": 6075 + }, + { + "epoch": 1.8880900281337918, + "grad_norm": 
0.298828125, + "learning_rate": 0.00010802593363555013, + "loss": 1.6001, + "step": 6076 + }, + { + "epoch": 1.8884026258205688, + "grad_norm": 0.29296875, + "learning_rate": 0.00010800144209112071, + "loss": 1.49, + "step": 6077 + }, + { + "epoch": 1.8887152235073459, + "grad_norm": 0.27734375, + "learning_rate": 0.00010797695006363398, + "loss": 1.5432, + "step": 6078 + }, + { + "epoch": 1.8890278211941232, + "grad_norm": 0.287109375, + "learning_rate": 0.00010795245755456861, + "loss": 1.5761, + "step": 6079 + }, + { + "epoch": 1.8893404188809004, + "grad_norm": 0.28515625, + "learning_rate": 0.00010792796456540324, + "loss": 1.4362, + "step": 6080 + }, + { + "epoch": 1.8896530165676775, + "grad_norm": 0.2890625, + "learning_rate": 0.00010790347109761656, + "loss": 1.4679, + "step": 6081 + }, + { + "epoch": 1.8899656142544545, + "grad_norm": 0.28125, + "learning_rate": 0.00010787897715268724, + "loss": 1.4464, + "step": 6082 + }, + { + "epoch": 1.8902782119412316, + "grad_norm": 0.28125, + "learning_rate": 0.00010785448273209406, + "loss": 1.5998, + "step": 6083 + }, + { + "epoch": 1.8905908096280086, + "grad_norm": 0.28515625, + "learning_rate": 0.00010782998783731573, + "loss": 1.6596, + "step": 6084 + }, + { + "epoch": 1.890903407314786, + "grad_norm": 0.2734375, + "learning_rate": 0.00010780549246983107, + "loss": 1.7341, + "step": 6085 + }, + { + "epoch": 1.891216005001563, + "grad_norm": 0.283203125, + "learning_rate": 0.00010778099663111885, + "loss": 1.4339, + "step": 6086 + }, + { + "epoch": 1.8915286026883402, + "grad_norm": 0.283203125, + "learning_rate": 0.00010775650032265798, + "loss": 1.5452, + "step": 6087 + }, + { + "epoch": 1.8918412003751173, + "grad_norm": 0.283203125, + "learning_rate": 0.00010773200354592727, + "loss": 1.4955, + "step": 6088 + }, + { + "epoch": 1.8921537980618943, + "grad_norm": 0.302734375, + "learning_rate": 0.0001077075063024057, + "loss": 1.4702, + "step": 6089 + }, + { + "epoch": 1.8924663957486714, + "grad_norm": 0.2890625, + "learning_rate": 0.00010768300859357212, + "loss": 1.3393, + "step": 6090 + }, + { + "epoch": 1.8927789934354484, + "grad_norm": 0.28125, + "learning_rate": 0.00010765851042090554, + "loss": 1.7447, + "step": 6091 + }, + { + "epoch": 1.8930915911222257, + "grad_norm": 0.28515625, + "learning_rate": 0.00010763401178588488, + "loss": 1.6784, + "step": 6092 + }, + { + "epoch": 1.893404188809003, + "grad_norm": 0.287109375, + "learning_rate": 0.00010760951268998925, + "loss": 1.6135, + "step": 6093 + }, + { + "epoch": 1.89371678649578, + "grad_norm": 0.287109375, + "learning_rate": 0.0001075850131346976, + "loss": 1.5082, + "step": 6094 + }, + { + "epoch": 1.894029384182557, + "grad_norm": 0.279296875, + "learning_rate": 0.0001075605131214891, + "loss": 1.462, + "step": 6095 + }, + { + "epoch": 1.8943419818693341, + "grad_norm": 0.287109375, + "learning_rate": 0.00010753601265184274, + "loss": 1.396, + "step": 6096 + }, + { + "epoch": 1.8946545795561112, + "grad_norm": 0.30078125, + "learning_rate": 0.00010751151172723773, + "loss": 1.6575, + "step": 6097 + }, + { + "epoch": 1.8949671772428884, + "grad_norm": 0.29296875, + "learning_rate": 0.00010748701034915314, + "loss": 1.7544, + "step": 6098 + }, + { + "epoch": 1.8952797749296655, + "grad_norm": 0.29296875, + "learning_rate": 0.00010746250851906823, + "loss": 1.3533, + "step": 6099 + }, + { + "epoch": 1.8955923726164428, + "grad_norm": 0.2890625, + "learning_rate": 0.00010743800623846214, + "loss": 1.9792, + "step": 6100 + }, + { + "epoch": 1.8959049703032198, + 
"grad_norm": 0.291015625, + "learning_rate": 0.00010741350350881419, + "loss": 1.4636, + "step": 6101 + }, + { + "epoch": 1.8962175679899969, + "grad_norm": 0.287109375, + "learning_rate": 0.00010738900033160353, + "loss": 1.6061, + "step": 6102 + }, + { + "epoch": 1.896530165676774, + "grad_norm": 0.298828125, + "learning_rate": 0.00010736449670830953, + "loss": 1.7217, + "step": 6103 + }, + { + "epoch": 1.896842763363551, + "grad_norm": 0.279296875, + "learning_rate": 0.00010733999264041146, + "loss": 1.708, + "step": 6104 + }, + { + "epoch": 1.8971553610503282, + "grad_norm": 0.267578125, + "learning_rate": 0.0001073154881293887, + "loss": 1.5846, + "step": 6105 + }, + { + "epoch": 1.8974679587371055, + "grad_norm": 0.2890625, + "learning_rate": 0.00010729098317672059, + "loss": 1.5484, + "step": 6106 + }, + { + "epoch": 1.8977805564238825, + "grad_norm": 0.287109375, + "learning_rate": 0.00010726647778388654, + "loss": 1.4086, + "step": 6107 + }, + { + "epoch": 1.8980931541106596, + "grad_norm": 0.283203125, + "learning_rate": 0.00010724197195236596, + "loss": 1.7811, + "step": 6108 + }, + { + "epoch": 1.8984057517974366, + "grad_norm": 0.287109375, + "learning_rate": 0.00010721746568363831, + "loss": 1.2625, + "step": 6109 + }, + { + "epoch": 1.8987183494842137, + "grad_norm": 0.27734375, + "learning_rate": 0.00010719295897918305, + "loss": 1.8101, + "step": 6110 + }, + { + "epoch": 1.899030947170991, + "grad_norm": 0.29296875, + "learning_rate": 0.0001071684518404797, + "loss": 1.4197, + "step": 6111 + }, + { + "epoch": 1.899343544857768, + "grad_norm": 0.28125, + "learning_rate": 0.00010714394426900778, + "loss": 1.3144, + "step": 6112 + }, + { + "epoch": 1.8996561425445453, + "grad_norm": 0.294921875, + "learning_rate": 0.00010711943626624686, + "loss": 1.5552, + "step": 6113 + }, + { + "epoch": 1.8999687402313223, + "grad_norm": 0.294921875, + "learning_rate": 0.00010709492783367645, + "loss": 1.5563, + "step": 6114 + }, + { + "epoch": 1.9002813379180994, + "grad_norm": 0.28515625, + "learning_rate": 0.00010707041897277623, + "loss": 1.6451, + "step": 6115 + }, + { + "epoch": 1.9005939356048764, + "grad_norm": 0.294921875, + "learning_rate": 0.00010704590968502581, + "loss": 1.5293, + "step": 6116 + }, + { + "epoch": 1.9009065332916535, + "grad_norm": 0.29296875, + "learning_rate": 0.00010702139997190483, + "loss": 1.4838, + "step": 6117 + }, + { + "epoch": 1.9012191309784308, + "grad_norm": 0.2734375, + "learning_rate": 0.00010699688983489302, + "loss": 1.5037, + "step": 6118 + }, + { + "epoch": 1.901531728665208, + "grad_norm": 0.2890625, + "learning_rate": 0.00010697237927547003, + "loss": 1.3853, + "step": 6119 + }, + { + "epoch": 1.901844326351985, + "grad_norm": 0.27734375, + "learning_rate": 0.00010694786829511562, + "loss": 1.3902, + "step": 6120 + }, + { + "epoch": 1.9021569240387621, + "grad_norm": 0.30859375, + "learning_rate": 0.00010692335689530955, + "loss": 1.5142, + "step": 6121 + }, + { + "epoch": 1.9024695217255392, + "grad_norm": 0.30078125, + "learning_rate": 0.00010689884507753159, + "loss": 1.6241, + "step": 6122 + }, + { + "epoch": 1.9027821194123162, + "grad_norm": 0.294921875, + "learning_rate": 0.00010687433284326159, + "loss": 1.451, + "step": 6123 + }, + { + "epoch": 1.9030947170990935, + "grad_norm": 0.2890625, + "learning_rate": 0.00010684982019397934, + "loss": 1.3795, + "step": 6124 + }, + { + "epoch": 1.9034073147858706, + "grad_norm": 0.28515625, + "learning_rate": 0.00010682530713116472, + "loss": 1.4519, + "step": 6125 + }, + { + "epoch": 
1.9037199124726478, + "grad_norm": 0.2734375, + "learning_rate": 0.00010680079365629758, + "loss": 1.6463, + "step": 6126 + }, + { + "epoch": 1.9040325101594249, + "grad_norm": 0.283203125, + "learning_rate": 0.00010677627977085788, + "loss": 1.305, + "step": 6127 + }, + { + "epoch": 1.904345107846202, + "grad_norm": 0.2890625, + "learning_rate": 0.00010675176547632555, + "loss": 1.6685, + "step": 6128 + }, + { + "epoch": 1.904657705532979, + "grad_norm": 0.2890625, + "learning_rate": 0.00010672725077418054, + "loss": 1.8257, + "step": 6129 + }, + { + "epoch": 1.904970303219756, + "grad_norm": 0.287109375, + "learning_rate": 0.00010670273566590281, + "loss": 1.4169, + "step": 6130 + }, + { + "epoch": 1.9052829009065333, + "grad_norm": 0.28515625, + "learning_rate": 0.0001066782201529724, + "loss": 1.7248, + "step": 6131 + }, + { + "epoch": 1.9055954985933106, + "grad_norm": 0.28125, + "learning_rate": 0.00010665370423686931, + "loss": 1.6358, + "step": 6132 + }, + { + "epoch": 1.9059080962800876, + "grad_norm": 0.27734375, + "learning_rate": 0.00010662918791907364, + "loss": 1.3639, + "step": 6133 + }, + { + "epoch": 1.9062206939668647, + "grad_norm": 0.298828125, + "learning_rate": 0.00010660467120106541, + "loss": 1.4551, + "step": 6134 + }, + { + "epoch": 1.9065332916536417, + "grad_norm": 0.283203125, + "learning_rate": 0.00010658015408432478, + "loss": 1.7171, + "step": 6135 + }, + { + "epoch": 1.9068458893404188, + "grad_norm": 0.287109375, + "learning_rate": 0.00010655563657033187, + "loss": 1.6136, + "step": 6136 + }, + { + "epoch": 1.907158487027196, + "grad_norm": 0.3046875, + "learning_rate": 0.00010653111866056685, + "loss": 1.5219, + "step": 6137 + }, + { + "epoch": 1.907471084713973, + "grad_norm": 0.30078125, + "learning_rate": 0.00010650660035650984, + "loss": 1.3852, + "step": 6138 + }, + { + "epoch": 1.9077836824007504, + "grad_norm": 0.283203125, + "learning_rate": 0.00010648208165964109, + "loss": 1.6926, + "step": 6139 + }, + { + "epoch": 1.9080962800875274, + "grad_norm": 0.275390625, + "learning_rate": 0.0001064575625714408, + "loss": 1.4854, + "step": 6140 + }, + { + "epoch": 1.9084088777743045, + "grad_norm": 0.294921875, + "learning_rate": 0.00010643304309338921, + "loss": 1.4488, + "step": 6141 + }, + { + "epoch": 1.9087214754610815, + "grad_norm": 0.2890625, + "learning_rate": 0.00010640852322696666, + "loss": 1.7811, + "step": 6142 + }, + { + "epoch": 1.9090340731478586, + "grad_norm": 0.27734375, + "learning_rate": 0.00010638400297365336, + "loss": 1.506, + "step": 6143 + }, + { + "epoch": 1.9093466708346358, + "grad_norm": 0.357421875, + "learning_rate": 0.00010635948233492968, + "loss": 1.994, + "step": 6144 + }, + { + "epoch": 1.909659268521413, + "grad_norm": 0.28125, + "learning_rate": 0.00010633496131227593, + "loss": 1.4202, + "step": 6145 + }, + { + "epoch": 1.9099718662081901, + "grad_norm": 0.283203125, + "learning_rate": 0.00010631043990717251, + "loss": 1.2593, + "step": 6146 + }, + { + "epoch": 1.9102844638949672, + "grad_norm": 0.29296875, + "learning_rate": 0.00010628591812109978, + "loss": 1.5465, + "step": 6147 + }, + { + "epoch": 1.9105970615817442, + "grad_norm": 0.291015625, + "learning_rate": 0.00010626139595553819, + "loss": 1.4455, + "step": 6148 + }, + { + "epoch": 1.9109096592685213, + "grad_norm": 0.28515625, + "learning_rate": 0.00010623687341196813, + "loss": 1.5223, + "step": 6149 + }, + { + "epoch": 1.9112222569552986, + "grad_norm": 0.28515625, + "learning_rate": 0.00010621235049187006, + "loss": 1.5872, + "step": 6150 + }, + { + 
"epoch": 1.9115348546420756, + "grad_norm": 0.283203125, + "learning_rate": 0.0001061878271967245, + "loss": 1.5521, + "step": 6151 + }, + { + "epoch": 1.911847452328853, + "grad_norm": 0.275390625, + "learning_rate": 0.0001061633035280119, + "loss": 1.3491, + "step": 6152 + }, + { + "epoch": 1.91216005001563, + "grad_norm": 0.296875, + "learning_rate": 0.00010613877948721282, + "loss": 1.84, + "step": 6153 + }, + { + "epoch": 1.912472647702407, + "grad_norm": 0.283203125, + "learning_rate": 0.00010611425507580781, + "loss": 1.7756, + "step": 6154 + }, + { + "epoch": 1.912785245389184, + "grad_norm": 0.28125, + "learning_rate": 0.00010608973029527742, + "loss": 1.4592, + "step": 6155 + }, + { + "epoch": 1.913097843075961, + "grad_norm": 0.271484375, + "learning_rate": 0.00010606520514710225, + "loss": 1.4681, + "step": 6156 + }, + { + "epoch": 1.9134104407627384, + "grad_norm": 0.28515625, + "learning_rate": 0.00010604067963276294, + "loss": 1.2755, + "step": 6157 + }, + { + "epoch": 1.9137230384495156, + "grad_norm": 0.271484375, + "learning_rate": 0.00010601615375374006, + "loss": 1.4351, + "step": 6158 + }, + { + "epoch": 1.9140356361362927, + "grad_norm": 0.271484375, + "learning_rate": 0.00010599162751151437, + "loss": 1.3687, + "step": 6159 + }, + { + "epoch": 1.9143482338230697, + "grad_norm": 0.310546875, + "learning_rate": 0.00010596710090756643, + "loss": 1.8149, + "step": 6160 + }, + { + "epoch": 1.9146608315098468, + "grad_norm": 0.279296875, + "learning_rate": 0.00010594257394337706, + "loss": 1.5358, + "step": 6161 + }, + { + "epoch": 1.9149734291966238, + "grad_norm": 0.296875, + "learning_rate": 0.00010591804662042688, + "loss": 1.851, + "step": 6162 + }, + { + "epoch": 1.915286026883401, + "grad_norm": 0.29296875, + "learning_rate": 0.0001058935189401967, + "loss": 1.5856, + "step": 6163 + }, + { + "epoch": 1.9155986245701782, + "grad_norm": 0.298828125, + "learning_rate": 0.00010586899090416727, + "loss": 1.681, + "step": 6164 + }, + { + "epoch": 1.9159112222569554, + "grad_norm": 0.283203125, + "learning_rate": 0.00010584446251381941, + "loss": 1.5954, + "step": 6165 + }, + { + "epoch": 1.9162238199437325, + "grad_norm": 0.291015625, + "learning_rate": 0.00010581993377063387, + "loss": 1.2033, + "step": 6166 + }, + { + "epoch": 1.9165364176305095, + "grad_norm": 0.2890625, + "learning_rate": 0.00010579540467609153, + "loss": 1.549, + "step": 6167 + }, + { + "epoch": 1.9168490153172866, + "grad_norm": 0.26953125, + "learning_rate": 0.0001057708752316732, + "loss": 1.6916, + "step": 6168 + }, + { + "epoch": 1.9171616130040636, + "grad_norm": 0.27734375, + "learning_rate": 0.0001057463454388598, + "loss": 1.5429, + "step": 6169 + }, + { + "epoch": 1.917474210690841, + "grad_norm": 0.28515625, + "learning_rate": 0.00010572181529913216, + "loss": 1.7526, + "step": 6170 + }, + { + "epoch": 1.9177868083776182, + "grad_norm": 0.28515625, + "learning_rate": 0.00010569728481397132, + "loss": 1.5457, + "step": 6171 + }, + { + "epoch": 1.9180994060643952, + "grad_norm": 0.29296875, + "learning_rate": 0.00010567275398485807, + "loss": 2.0223, + "step": 6172 + }, + { + "epoch": 1.9184120037511723, + "grad_norm": 0.2890625, + "learning_rate": 0.00010564822281327346, + "loss": 1.2732, + "step": 6173 + }, + { + "epoch": 1.9187246014379493, + "grad_norm": 0.3046875, + "learning_rate": 0.00010562369130069842, + "loss": 1.4948, + "step": 6174 + }, + { + "epoch": 1.9190371991247264, + "grad_norm": 0.27734375, + "learning_rate": 0.00010559915944861398, + "loss": 1.5727, + "step": 6175 + }, + 
{ + "epoch": 1.9193497968115036, + "grad_norm": 0.28515625, + "learning_rate": 0.00010557462725850114, + "loss": 1.6334, + "step": 6176 + }, + { + "epoch": 1.9196623944982807, + "grad_norm": 0.29296875, + "learning_rate": 0.00010555009473184096, + "loss": 1.3827, + "step": 6177 + }, + { + "epoch": 1.919974992185058, + "grad_norm": 0.287109375, + "learning_rate": 0.00010552556187011451, + "loss": 1.3521, + "step": 6178 + }, + { + "epoch": 1.920287589871835, + "grad_norm": 0.296875, + "learning_rate": 0.00010550102867480283, + "loss": 1.5802, + "step": 6179 + }, + { + "epoch": 1.920600187558612, + "grad_norm": 0.27734375, + "learning_rate": 0.00010547649514738702, + "loss": 1.5329, + "step": 6180 + }, + { + "epoch": 1.920912785245389, + "grad_norm": 0.279296875, + "learning_rate": 0.00010545196128934824, + "loss": 1.7083, + "step": 6181 + }, + { + "epoch": 1.9212253829321662, + "grad_norm": 0.28515625, + "learning_rate": 0.0001054274271021676, + "loss": 1.4001, + "step": 6182 + }, + { + "epoch": 1.9215379806189434, + "grad_norm": 0.287109375, + "learning_rate": 0.00010540289258732627, + "loss": 1.3963, + "step": 6183 + }, + { + "epoch": 1.9218505783057207, + "grad_norm": 0.298828125, + "learning_rate": 0.00010537835774630547, + "loss": 1.6051, + "step": 6184 + }, + { + "epoch": 1.9221631759924978, + "grad_norm": 0.294921875, + "learning_rate": 0.00010535382258058632, + "loss": 1.5225, + "step": 6185 + }, + { + "epoch": 1.9224757736792748, + "grad_norm": 0.296875, + "learning_rate": 0.00010532928709165006, + "loss": 1.3148, + "step": 6186 + }, + { + "epoch": 1.9227883713660519, + "grad_norm": 0.2734375, + "learning_rate": 0.00010530475128097799, + "loss": 1.4773, + "step": 6187 + }, + { + "epoch": 1.923100969052829, + "grad_norm": 0.28515625, + "learning_rate": 0.0001052802151500513, + "loss": 1.4304, + "step": 6188 + }, + { + "epoch": 1.9234135667396062, + "grad_norm": 0.29296875, + "learning_rate": 0.00010525567870035129, + "loss": 1.6024, + "step": 6189 + }, + { + "epoch": 1.9237261644263832, + "grad_norm": 0.279296875, + "learning_rate": 0.00010523114193335928, + "loss": 1.57, + "step": 6190 + }, + { + "epoch": 1.9240387621131605, + "grad_norm": 0.275390625, + "learning_rate": 0.00010520660485055656, + "loss": 1.622, + "step": 6191 + }, + { + "epoch": 1.9243513597999375, + "grad_norm": 0.294921875, + "learning_rate": 0.00010518206745342445, + "loss": 1.465, + "step": 6192 + }, + { + "epoch": 1.9246639574867146, + "grad_norm": 0.27734375, + "learning_rate": 0.00010515752974344432, + "loss": 1.4871, + "step": 6193 + }, + { + "epoch": 1.9249765551734916, + "grad_norm": 0.302734375, + "learning_rate": 0.00010513299172209756, + "loss": 1.5585, + "step": 6194 + }, + { + "epoch": 1.9252891528602687, + "grad_norm": 0.30859375, + "learning_rate": 0.00010510845339086557, + "loss": 1.7355, + "step": 6195 + }, + { + "epoch": 1.925601750547046, + "grad_norm": 0.283203125, + "learning_rate": 0.00010508391475122972, + "loss": 1.6971, + "step": 6196 + }, + { + "epoch": 1.925914348233823, + "grad_norm": 0.3046875, + "learning_rate": 0.00010505937580467146, + "loss": 1.394, + "step": 6197 + }, + { + "epoch": 1.9262269459206003, + "grad_norm": 0.302734375, + "learning_rate": 0.00010503483655267224, + "loss": 1.9124, + "step": 6198 + }, + { + "epoch": 1.9265395436073773, + "grad_norm": 0.30078125, + "learning_rate": 0.00010501029699671352, + "loss": 1.6373, + "step": 6199 + }, + { + "epoch": 1.9268521412941544, + "grad_norm": 0.29296875, + "learning_rate": 0.00010498575713827677, + "loss": 1.6814, + "step": 
6200 + }, + { + "epoch": 1.9271647389809314, + "grad_norm": 0.302734375, + "learning_rate": 0.00010496121697884352, + "loss": 1.6828, + "step": 6201 + }, + { + "epoch": 1.9274773366677087, + "grad_norm": 0.296875, + "learning_rate": 0.00010493667651989529, + "loss": 1.5589, + "step": 6202 + }, + { + "epoch": 1.9277899343544858, + "grad_norm": 0.28125, + "learning_rate": 0.0001049121357629136, + "loss": 1.6377, + "step": 6203 + }, + { + "epoch": 1.928102532041263, + "grad_norm": 0.291015625, + "learning_rate": 0.00010488759470937998, + "loss": 1.3152, + "step": 6204 + }, + { + "epoch": 1.92841512972804, + "grad_norm": 0.3046875, + "learning_rate": 0.00010486305336077609, + "loss": 1.4533, + "step": 6205 + }, + { + "epoch": 1.9287277274148171, + "grad_norm": 0.298828125, + "learning_rate": 0.00010483851171858346, + "loss": 1.4259, + "step": 6206 + }, + { + "epoch": 1.9290403251015942, + "grad_norm": 0.275390625, + "learning_rate": 0.00010481396978428368, + "loss": 1.5113, + "step": 6207 + }, + { + "epoch": 1.9293529227883712, + "grad_norm": 0.279296875, + "learning_rate": 0.00010478942755935846, + "loss": 1.7184, + "step": 6208 + }, + { + "epoch": 1.9296655204751485, + "grad_norm": 0.294921875, + "learning_rate": 0.00010476488504528936, + "loss": 1.3053, + "step": 6209 + }, + { + "epoch": 1.9299781181619255, + "grad_norm": 0.296875, + "learning_rate": 0.00010474034224355808, + "loss": 1.5313, + "step": 6210 + }, + { + "epoch": 1.9302907158487028, + "grad_norm": 0.294921875, + "learning_rate": 0.00010471579915564631, + "loss": 1.5783, + "step": 6211 + }, + { + "epoch": 1.9306033135354799, + "grad_norm": 0.29296875, + "learning_rate": 0.00010469125578303573, + "loss": 1.4423, + "step": 6212 + }, + { + "epoch": 1.930915911222257, + "grad_norm": 0.28125, + "learning_rate": 0.00010466671212720805, + "loss": 1.7387, + "step": 6213 + }, + { + "epoch": 1.931228508909034, + "grad_norm": 0.296875, + "learning_rate": 0.00010464216818964502, + "loss": 1.6482, + "step": 6214 + }, + { + "epoch": 1.9315411065958112, + "grad_norm": 0.28125, + "learning_rate": 0.00010461762397182837, + "loss": 1.6284, + "step": 6215 + }, + { + "epoch": 1.9318537042825883, + "grad_norm": 0.298828125, + "learning_rate": 0.00010459307947523991, + "loss": 1.5385, + "step": 6216 + }, + { + "epoch": 1.9321663019693656, + "grad_norm": 0.30078125, + "learning_rate": 0.00010456853470136136, + "loss": 1.7142, + "step": 6217 + }, + { + "epoch": 1.9324788996561426, + "grad_norm": 0.291015625, + "learning_rate": 0.00010454398965167458, + "loss": 1.5259, + "step": 6218 + }, + { + "epoch": 1.9327914973429197, + "grad_norm": 0.283203125, + "learning_rate": 0.00010451944432766131, + "loss": 1.5003, + "step": 6219 + }, + { + "epoch": 1.9331040950296967, + "grad_norm": 0.291015625, + "learning_rate": 0.00010449489873080345, + "loss": 1.3931, + "step": 6220 + }, + { + "epoch": 1.9334166927164738, + "grad_norm": 0.287109375, + "learning_rate": 0.00010447035286258282, + "loss": 1.4621, + "step": 6221 + }, + { + "epoch": 1.933729290403251, + "grad_norm": 0.283203125, + "learning_rate": 0.0001044458067244813, + "loss": 1.3083, + "step": 6222 + }, + { + "epoch": 1.934041888090028, + "grad_norm": 0.294921875, + "learning_rate": 0.00010442126031798076, + "loss": 1.7117, + "step": 6223 + }, + { + "epoch": 1.9343544857768054, + "grad_norm": 0.373046875, + "learning_rate": 0.00010439671364456312, + "loss": 2.2069, + "step": 6224 + }, + { + "epoch": 1.9346670834635824, + "grad_norm": 0.2890625, + "learning_rate": 0.00010437216670571021, + "loss": 1.3417, 
+ "step": 6225 + }, + { + "epoch": 1.9349796811503595, + "grad_norm": 0.29296875, + "learning_rate": 0.00010434761950290408, + "loss": 1.4831, + "step": 6226 + }, + { + "epoch": 1.9352922788371365, + "grad_norm": 0.322265625, + "learning_rate": 0.0001043230720376266, + "loss": 1.6731, + "step": 6227 + }, + { + "epoch": 1.9356048765239138, + "grad_norm": 0.296875, + "learning_rate": 0.00010429852431135976, + "loss": 1.6335, + "step": 6228 + }, + { + "epoch": 1.9359174742106908, + "grad_norm": 0.28125, + "learning_rate": 0.00010427397632558556, + "loss": 1.5968, + "step": 6229 + }, + { + "epoch": 1.936230071897468, + "grad_norm": 0.291015625, + "learning_rate": 0.00010424942808178593, + "loss": 1.3624, + "step": 6230 + }, + { + "epoch": 1.9365426695842451, + "grad_norm": 0.2734375, + "learning_rate": 0.00010422487958144289, + "loss": 1.5697, + "step": 6231 + }, + { + "epoch": 1.9368552672710222, + "grad_norm": 0.291015625, + "learning_rate": 0.0001042003308260385, + "loss": 1.4961, + "step": 6232 + }, + { + "epoch": 1.9371678649577992, + "grad_norm": 0.2890625, + "learning_rate": 0.0001041757818170548, + "loss": 1.7063, + "step": 6233 + }, + { + "epoch": 1.9374804626445763, + "grad_norm": 0.30078125, + "learning_rate": 0.00010415123255597383, + "loss": 1.6431, + "step": 6234 + }, + { + "epoch": 1.9377930603313536, + "grad_norm": 0.294921875, + "learning_rate": 0.00010412668304427766, + "loss": 1.6485, + "step": 6235 + }, + { + "epoch": 1.9381056580181306, + "grad_norm": 0.30078125, + "learning_rate": 0.00010410213328344838, + "loss": 1.4691, + "step": 6236 + }, + { + "epoch": 1.9384182557049079, + "grad_norm": 0.3046875, + "learning_rate": 0.00010407758327496807, + "loss": 1.5958, + "step": 6237 + }, + { + "epoch": 1.938730853391685, + "grad_norm": 0.28515625, + "learning_rate": 0.00010405303302031888, + "loss": 1.3413, + "step": 6238 + }, + { + "epoch": 1.939043451078462, + "grad_norm": 0.27734375, + "learning_rate": 0.0001040284825209829, + "loss": 1.2792, + "step": 6239 + }, + { + "epoch": 1.939356048765239, + "grad_norm": 0.294921875, + "learning_rate": 0.0001040039317784423, + "loss": 1.3078, + "step": 6240 + }, + { + "epoch": 1.9396686464520163, + "grad_norm": 0.2890625, + "learning_rate": 0.00010397938079417926, + "loss": 1.7102, + "step": 6241 + }, + { + "epoch": 1.9399812441387934, + "grad_norm": 0.2890625, + "learning_rate": 0.00010395482956967592, + "loss": 1.5055, + "step": 6242 + }, + { + "epoch": 1.9402938418255706, + "grad_norm": 0.291015625, + "learning_rate": 0.00010393027810641445, + "loss": 1.7202, + "step": 6243 + }, + { + "epoch": 1.9406064395123477, + "grad_norm": 0.296875, + "learning_rate": 0.00010390572640587713, + "loss": 1.5117, + "step": 6244 + }, + { + "epoch": 1.9409190371991247, + "grad_norm": 0.291015625, + "learning_rate": 0.00010388117446954609, + "loss": 1.581, + "step": 6245 + }, + { + "epoch": 1.9412316348859018, + "grad_norm": 0.28515625, + "learning_rate": 0.0001038566222989036, + "loss": 1.7444, + "step": 6246 + }, + { + "epoch": 1.9415442325726788, + "grad_norm": 0.28515625, + "learning_rate": 0.00010383206989543195, + "loss": 1.4812, + "step": 6247 + }, + { + "epoch": 1.941856830259456, + "grad_norm": 0.296875, + "learning_rate": 0.00010380751726061333, + "loss": 1.6901, + "step": 6248 + }, + { + "epoch": 1.9421694279462332, + "grad_norm": 0.283203125, + "learning_rate": 0.00010378296439593002, + "loss": 1.5478, + "step": 6249 + }, + { + "epoch": 1.9424820256330104, + "grad_norm": 0.29296875, + "learning_rate": 0.00010375841130286437, + "loss": 
1.305, + "step": 6250 + }, + { + "epoch": 1.9427946233197875, + "grad_norm": 0.28515625, + "learning_rate": 0.00010373385798289861, + "loss": 1.6149, + "step": 6251 + }, + { + "epoch": 1.9431072210065645, + "grad_norm": 0.298828125, + "learning_rate": 0.0001037093044375151, + "loss": 1.5205, + "step": 6252 + }, + { + "epoch": 1.9434198186933416, + "grad_norm": 0.28515625, + "learning_rate": 0.00010368475066819613, + "loss": 1.4071, + "step": 6253 + }, + { + "epoch": 1.9437324163801186, + "grad_norm": 0.306640625, + "learning_rate": 0.00010366019667642412, + "loss": 1.4452, + "step": 6254 + }, + { + "epoch": 1.944045014066896, + "grad_norm": 0.287109375, + "learning_rate": 0.00010363564246368134, + "loss": 1.4957, + "step": 6255 + }, + { + "epoch": 1.9443576117536732, + "grad_norm": 0.2890625, + "learning_rate": 0.00010361108803145019, + "loss": 1.6569, + "step": 6256 + }, + { + "epoch": 1.9446702094404502, + "grad_norm": 0.28125, + "learning_rate": 0.00010358653338121305, + "loss": 1.4824, + "step": 6257 + }, + { + "epoch": 1.9449828071272273, + "grad_norm": 0.29296875, + "learning_rate": 0.00010356197851445233, + "loss": 1.4794, + "step": 6258 + }, + { + "epoch": 1.9452954048140043, + "grad_norm": 0.29296875, + "learning_rate": 0.00010353742343265043, + "loss": 1.7687, + "step": 6259 + }, + { + "epoch": 1.9456080025007814, + "grad_norm": 0.314453125, + "learning_rate": 0.00010351286813728978, + "loss": 1.3405, + "step": 6260 + }, + { + "epoch": 1.9459206001875586, + "grad_norm": 0.29296875, + "learning_rate": 0.00010348831262985277, + "loss": 1.4164, + "step": 6261 + }, + { + "epoch": 1.9462331978743357, + "grad_norm": 0.302734375, + "learning_rate": 0.00010346375691182191, + "loss": 1.5927, + "step": 6262 + }, + { + "epoch": 1.946545795561113, + "grad_norm": 0.28515625, + "learning_rate": 0.00010343920098467958, + "loss": 1.7809, + "step": 6263 + }, + { + "epoch": 1.94685839324789, + "grad_norm": 0.33203125, + "learning_rate": 0.00010341464484990837, + "loss": 1.4906, + "step": 6264 + }, + { + "epoch": 1.947170990934667, + "grad_norm": 0.306640625, + "learning_rate": 0.00010339008850899068, + "loss": 1.5188, + "step": 6265 + }, + { + "epoch": 1.947483588621444, + "grad_norm": 0.28515625, + "learning_rate": 0.00010336553196340902, + "loss": 1.3992, + "step": 6266 + }, + { + "epoch": 1.9477961863082212, + "grad_norm": 0.291015625, + "learning_rate": 0.00010334097521464589, + "loss": 1.4773, + "step": 6267 + }, + { + "epoch": 1.9481087839949984, + "grad_norm": 0.271484375, + "learning_rate": 0.00010331641826418385, + "loss": 1.3414, + "step": 6268 + }, + { + "epoch": 1.9484213816817757, + "grad_norm": 0.29296875, + "learning_rate": 0.0001032918611135054, + "loss": 1.617, + "step": 6269 + }, + { + "epoch": 1.9487339793685527, + "grad_norm": 0.294921875, + "learning_rate": 0.0001032673037640931, + "loss": 1.2843, + "step": 6270 + }, + { + "epoch": 1.9490465770553298, + "grad_norm": 0.2890625, + "learning_rate": 0.00010324274621742953, + "loss": 1.5678, + "step": 6271 + }, + { + "epoch": 1.9493591747421068, + "grad_norm": 0.2734375, + "learning_rate": 0.00010321818847499725, + "loss": 1.3233, + "step": 6272 + }, + { + "epoch": 1.949671772428884, + "grad_norm": 0.30078125, + "learning_rate": 0.00010319363053827878, + "loss": 1.3855, + "step": 6273 + }, + { + "epoch": 1.9499843701156612, + "grad_norm": 0.29296875, + "learning_rate": 0.0001031690724087568, + "loss": 1.5765, + "step": 6274 + }, + { + "epoch": 1.9502969678024382, + "grad_norm": 0.30078125, + "learning_rate": 0.00010314451408791385, 
+ "loss": 1.3791, + "step": 6275 + }, + { + "epoch": 1.9506095654892155, + "grad_norm": 0.267578125, + "learning_rate": 0.00010311995557723262, + "loss": 1.3775, + "step": 6276 + }, + { + "epoch": 1.9509221631759925, + "grad_norm": 0.296875, + "learning_rate": 0.00010309539687819567, + "loss": 1.611, + "step": 6277 + }, + { + "epoch": 1.9512347608627696, + "grad_norm": 0.29296875, + "learning_rate": 0.00010307083799228567, + "loss": 1.4502, + "step": 6278 + }, + { + "epoch": 1.9515473585495466, + "grad_norm": 0.298828125, + "learning_rate": 0.00010304627892098526, + "loss": 1.521, + "step": 6279 + }, + { + "epoch": 1.9518599562363237, + "grad_norm": 0.302734375, + "learning_rate": 0.00010302171966577711, + "loss": 1.5794, + "step": 6280 + }, + { + "epoch": 1.952172553923101, + "grad_norm": 0.296875, + "learning_rate": 0.0001029971602281439, + "loss": 1.5355, + "step": 6281 + }, + { + "epoch": 1.9524851516098782, + "grad_norm": 0.29296875, + "learning_rate": 0.00010297260060956831, + "loss": 1.3984, + "step": 6282 + }, + { + "epoch": 1.9527977492966553, + "grad_norm": 0.294921875, + "learning_rate": 0.00010294804081153304, + "loss": 1.3863, + "step": 6283 + }, + { + "epoch": 1.9531103469834323, + "grad_norm": 0.28515625, + "learning_rate": 0.00010292348083552079, + "loss": 1.571, + "step": 6284 + }, + { + "epoch": 1.9534229446702094, + "grad_norm": 0.298828125, + "learning_rate": 0.00010289892068301426, + "loss": 1.3656, + "step": 6285 + }, + { + "epoch": 1.9537355423569864, + "grad_norm": 0.291015625, + "learning_rate": 0.00010287436035549621, + "loss": 1.8751, + "step": 6286 + }, + { + "epoch": 1.9540481400437637, + "grad_norm": 0.30078125, + "learning_rate": 0.00010284979985444933, + "loss": 1.3707, + "step": 6287 + }, + { + "epoch": 1.9543607377305408, + "grad_norm": 0.271484375, + "learning_rate": 0.00010282523918135642, + "loss": 1.4945, + "step": 6288 + }, + { + "epoch": 1.954673335417318, + "grad_norm": 0.29296875, + "learning_rate": 0.00010280067833770024, + "loss": 1.4987, + "step": 6289 + }, + { + "epoch": 1.954985933104095, + "grad_norm": 0.287109375, + "learning_rate": 0.00010277611732496353, + "loss": 1.461, + "step": 6290 + }, + { + "epoch": 1.9552985307908721, + "grad_norm": 0.302734375, + "learning_rate": 0.00010275155614462905, + "loss": 1.6741, + "step": 6291 + }, + { + "epoch": 1.9556111284776492, + "grad_norm": 0.28515625, + "learning_rate": 0.00010272699479817967, + "loss": 1.4173, + "step": 6292 + }, + { + "epoch": 1.9559237261644262, + "grad_norm": 0.298828125, + "learning_rate": 0.0001027024332870981, + "loss": 1.8938, + "step": 6293 + }, + { + "epoch": 1.9562363238512035, + "grad_norm": 0.279296875, + "learning_rate": 0.00010267787161286719, + "loss": 1.5966, + "step": 6294 + }, + { + "epoch": 1.9565489215379808, + "grad_norm": 0.291015625, + "learning_rate": 0.00010265330977696979, + "loss": 1.4133, + "step": 6295 + }, + { + "epoch": 1.9568615192247578, + "grad_norm": 0.279296875, + "learning_rate": 0.00010262874778088869, + "loss": 1.3799, + "step": 6296 + }, + { + "epoch": 1.9571741169115349, + "grad_norm": 0.310546875, + "learning_rate": 0.0001026041856261067, + "loss": 1.4744, + "step": 6297 + }, + { + "epoch": 1.957486714598312, + "grad_norm": 0.287109375, + "learning_rate": 0.00010257962331410673, + "loss": 1.3745, + "step": 6298 + }, + { + "epoch": 1.957799312285089, + "grad_norm": 0.287109375, + "learning_rate": 0.00010255506084637161, + "loss": 1.5977, + "step": 6299 + }, + { + "epoch": 1.9581119099718662, + "grad_norm": 0.296875, + "learning_rate": 
0.0001025304982243842, + "loss": 1.4493, + "step": 6300 + }, + { + "epoch": 1.9584245076586433, + "grad_norm": 0.298828125, + "learning_rate": 0.00010250593544962744, + "loss": 1.6393, + "step": 6301 + }, + { + "epoch": 1.9587371053454206, + "grad_norm": 0.279296875, + "learning_rate": 0.00010248137252358412, + "loss": 1.7133, + "step": 6302 + }, + { + "epoch": 1.9590497030321976, + "grad_norm": 0.2890625, + "learning_rate": 0.00010245680944773717, + "loss": 1.4606, + "step": 6303 + }, + { + "epoch": 1.9593623007189747, + "grad_norm": 0.275390625, + "learning_rate": 0.00010243224622356951, + "loss": 1.3692, + "step": 6304 + }, + { + "epoch": 1.9596748984057517, + "grad_norm": 0.302734375, + "learning_rate": 0.00010240768285256404, + "loss": 1.5115, + "step": 6305 + }, + { + "epoch": 1.9599874960925288, + "grad_norm": 0.294921875, + "learning_rate": 0.00010238311933620373, + "loss": 1.3411, + "step": 6306 + }, + { + "epoch": 1.960300093779306, + "grad_norm": 0.283203125, + "learning_rate": 0.00010235855567597143, + "loss": 1.7619, + "step": 6307 + }, + { + "epoch": 1.9606126914660833, + "grad_norm": 0.27734375, + "learning_rate": 0.00010233399187335013, + "loss": 1.4151, + "step": 6308 + }, + { + "epoch": 1.9609252891528604, + "grad_norm": 0.27734375, + "learning_rate": 0.00010230942792982275, + "loss": 1.4594, + "step": 6309 + }, + { + "epoch": 1.9612378868396374, + "grad_norm": 0.30078125, + "learning_rate": 0.00010228486384687227, + "loss": 1.3581, + "step": 6310 + }, + { + "epoch": 1.9615504845264145, + "grad_norm": 0.28125, + "learning_rate": 0.00010226029962598165, + "loss": 1.5394, + "step": 6311 + }, + { + "epoch": 1.9618630822131915, + "grad_norm": 0.275390625, + "learning_rate": 0.0001022357352686339, + "loss": 1.7881, + "step": 6312 + }, + { + "epoch": 1.9621756798999688, + "grad_norm": 0.294921875, + "learning_rate": 0.0001022111707763119, + "loss": 1.352, + "step": 6313 + }, + { + "epoch": 1.9624882775867458, + "grad_norm": 0.287109375, + "learning_rate": 0.00010218660615049876, + "loss": 1.5106, + "step": 6314 + }, + { + "epoch": 1.962800875273523, + "grad_norm": 0.2890625, + "learning_rate": 0.00010216204139267737, + "loss": 1.7533, + "step": 6315 + }, + { + "epoch": 1.9631134729603001, + "grad_norm": 0.29296875, + "learning_rate": 0.00010213747650433081, + "loss": 1.5134, + "step": 6316 + }, + { + "epoch": 1.9634260706470772, + "grad_norm": 0.287109375, + "learning_rate": 0.00010211291148694204, + "loss": 1.4083, + "step": 6317 + }, + { + "epoch": 1.9637386683338542, + "grad_norm": 0.3046875, + "learning_rate": 0.00010208834634199418, + "loss": 1.4584, + "step": 6318 + }, + { + "epoch": 1.9640512660206313, + "grad_norm": 0.30859375, + "learning_rate": 0.00010206378107097012, + "loss": 1.2652, + "step": 6319 + }, + { + "epoch": 1.9643638637074086, + "grad_norm": 0.2890625, + "learning_rate": 0.00010203921567535301, + "loss": 1.562, + "step": 6320 + }, + { + "epoch": 1.9646764613941858, + "grad_norm": 0.283203125, + "learning_rate": 0.00010201465015662583, + "loss": 1.5787, + "step": 6321 + }, + { + "epoch": 1.9649890590809629, + "grad_norm": 0.298828125, + "learning_rate": 0.00010199008451627166, + "loss": 1.546, + "step": 6322 + }, + { + "epoch": 1.96530165676774, + "grad_norm": 0.2890625, + "learning_rate": 0.00010196551875577354, + "loss": 1.2498, + "step": 6323 + }, + { + "epoch": 1.965614254454517, + "grad_norm": 0.271484375, + "learning_rate": 0.00010194095287661458, + "loss": 1.639, + "step": 6324 + }, + { + "epoch": 1.965926852141294, + "grad_norm": 0.310546875, + 
"learning_rate": 0.00010191638688027778, + "loss": 1.601, + "step": 6325 + }, + { + "epoch": 1.9662394498280713, + "grad_norm": 0.376953125, + "learning_rate": 0.0001018918207682463, + "loss": 2.1508, + "step": 6326 + }, + { + "epoch": 1.9665520475148484, + "grad_norm": 0.3046875, + "learning_rate": 0.00010186725454200316, + "loss": 1.4983, + "step": 6327 + }, + { + "epoch": 1.9668646452016256, + "grad_norm": 0.29296875, + "learning_rate": 0.00010184268820303149, + "loss": 1.6153, + "step": 6328 + }, + { + "epoch": 1.9671772428884027, + "grad_norm": 0.294921875, + "learning_rate": 0.00010181812175281438, + "loss": 1.541, + "step": 6329 + }, + { + "epoch": 1.9674898405751797, + "grad_norm": 0.287109375, + "learning_rate": 0.00010179355519283498, + "loss": 1.4212, + "step": 6330 + }, + { + "epoch": 1.9678024382619568, + "grad_norm": 0.298828125, + "learning_rate": 0.00010176898852457633, + "loss": 1.4843, + "step": 6331 + }, + { + "epoch": 1.9681150359487338, + "grad_norm": 0.2890625, + "learning_rate": 0.00010174442174952161, + "loss": 1.5049, + "step": 6332 + }, + { + "epoch": 1.968427633635511, + "grad_norm": 0.28515625, + "learning_rate": 0.00010171985486915389, + "loss": 1.4579, + "step": 6333 + }, + { + "epoch": 1.9687402313222884, + "grad_norm": 0.287109375, + "learning_rate": 0.00010169528788495637, + "loss": 1.3979, + "step": 6334 + }, + { + "epoch": 1.9690528290090654, + "grad_norm": 0.294921875, + "learning_rate": 0.00010167072079841216, + "loss": 1.4632, + "step": 6335 + }, + { + "epoch": 1.9693654266958425, + "grad_norm": 0.28125, + "learning_rate": 0.00010164615361100442, + "loss": 1.5255, + "step": 6336 + }, + { + "epoch": 1.9696780243826195, + "grad_norm": 0.28515625, + "learning_rate": 0.00010162158632421625, + "loss": 1.7949, + "step": 6337 + }, + { + "epoch": 1.9699906220693966, + "grad_norm": 0.306640625, + "learning_rate": 0.00010159701893953089, + "loss": 1.482, + "step": 6338 + }, + { + "epoch": 1.9703032197561738, + "grad_norm": 0.296875, + "learning_rate": 0.00010157245145843141, + "loss": 1.892, + "step": 6339 + }, + { + "epoch": 1.970615817442951, + "grad_norm": 0.302734375, + "learning_rate": 0.00010154788388240106, + "loss": 1.4037, + "step": 6340 + }, + { + "epoch": 1.9709284151297282, + "grad_norm": 0.27734375, + "learning_rate": 0.00010152331621292299, + "loss": 1.4799, + "step": 6341 + }, + { + "epoch": 1.9712410128165052, + "grad_norm": 0.291015625, + "learning_rate": 0.0001014987484514804, + "loss": 1.6288, + "step": 6342 + }, + { + "epoch": 1.9715536105032823, + "grad_norm": 0.30078125, + "learning_rate": 0.00010147418059955643, + "loss": 1.8096, + "step": 6343 + }, + { + "epoch": 1.9718662081900593, + "grad_norm": 0.291015625, + "learning_rate": 0.00010144961265863431, + "loss": 1.6755, + "step": 6344 + }, + { + "epoch": 1.9721788058768364, + "grad_norm": 0.296875, + "learning_rate": 0.0001014250446301972, + "loss": 1.594, + "step": 6345 + }, + { + "epoch": 1.9724914035636136, + "grad_norm": 0.310546875, + "learning_rate": 0.00010140047651572835, + "loss": 1.4913, + "step": 6346 + }, + { + "epoch": 1.972804001250391, + "grad_norm": 0.298828125, + "learning_rate": 0.00010137590831671093, + "loss": 1.6336, + "step": 6347 + }, + { + "epoch": 1.973116598937168, + "grad_norm": 0.34765625, + "learning_rate": 0.00010135134003462823, + "loss": 1.7255, + "step": 6348 + }, + { + "epoch": 1.973429196623945, + "grad_norm": 0.28515625, + "learning_rate": 0.00010132677167096333, + "loss": 1.7847, + "step": 6349 + }, + { + "epoch": 1.973741794310722, + "grad_norm": 
0.40625, + "learning_rate": 0.00010130220322719958, + "loss": 2.4353, + "step": 6350 + }, + { + "epoch": 1.974054391997499, + "grad_norm": 0.296875, + "learning_rate": 0.00010127763470482014, + "loss": 1.3342, + "step": 6351 + }, + { + "epoch": 1.9743669896842764, + "grad_norm": 0.28515625, + "learning_rate": 0.0001012530661053083, + "loss": 1.5728, + "step": 6352 + }, + { + "epoch": 1.9746795873710534, + "grad_norm": 0.296875, + "learning_rate": 0.00010122849743014722, + "loss": 1.7076, + "step": 6353 + }, + { + "epoch": 1.9749921850578307, + "grad_norm": 0.2890625, + "learning_rate": 0.00010120392868082022, + "loss": 1.4061, + "step": 6354 + }, + { + "epoch": 1.9753047827446077, + "grad_norm": 0.291015625, + "learning_rate": 0.00010117935985881049, + "loss": 1.5505, + "step": 6355 + }, + { + "epoch": 1.9756173804313848, + "grad_norm": 0.287109375, + "learning_rate": 0.00010115479096560133, + "loss": 1.484, + "step": 6356 + }, + { + "epoch": 1.9759299781181618, + "grad_norm": 0.283203125, + "learning_rate": 0.00010113022200267593, + "loss": 1.3421, + "step": 6357 + }, + { + "epoch": 1.976242575804939, + "grad_norm": 0.275390625, + "learning_rate": 0.0001011056529715176, + "loss": 1.566, + "step": 6358 + }, + { + "epoch": 1.9765551734917162, + "grad_norm": 0.29296875, + "learning_rate": 0.00010108108387360961, + "loss": 1.6118, + "step": 6359 + }, + { + "epoch": 1.9768677711784934, + "grad_norm": 0.3046875, + "learning_rate": 0.0001010565147104352, + "loss": 1.6341, + "step": 6360 + }, + { + "epoch": 1.9771803688652705, + "grad_norm": 0.296875, + "learning_rate": 0.00010103194548347763, + "loss": 1.5569, + "step": 6361 + }, + { + "epoch": 1.9774929665520475, + "grad_norm": 0.28515625, + "learning_rate": 0.00010100737619422023, + "loss": 1.7467, + "step": 6362 + }, + { + "epoch": 1.9778055642388246, + "grad_norm": 0.283203125, + "learning_rate": 0.0001009828068441462, + "loss": 1.3301, + "step": 6363 + }, + { + "epoch": 1.9781181619256016, + "grad_norm": 0.291015625, + "learning_rate": 0.00010095823743473891, + "loss": 1.5612, + "step": 6364 + }, + { + "epoch": 1.978430759612379, + "grad_norm": 0.279296875, + "learning_rate": 0.00010093366796748158, + "loss": 1.7422, + "step": 6365 + }, + { + "epoch": 1.978743357299156, + "grad_norm": 0.27734375, + "learning_rate": 0.00010090909844385754, + "loss": 1.6783, + "step": 6366 + }, + { + "epoch": 1.9790559549859332, + "grad_norm": 0.291015625, + "learning_rate": 0.00010088452886535005, + "loss": 1.5214, + "step": 6367 + }, + { + "epoch": 1.9793685526727103, + "grad_norm": 0.302734375, + "learning_rate": 0.00010085995923344245, + "loss": 1.7021, + "step": 6368 + }, + { + "epoch": 1.9796811503594873, + "grad_norm": 0.294921875, + "learning_rate": 0.00010083538954961799, + "loss": 1.5531, + "step": 6369 + }, + { + "epoch": 1.9799937480462644, + "grad_norm": 0.287109375, + "learning_rate": 0.00010081081981536003, + "loss": 1.1152, + "step": 6370 + }, + { + "epoch": 1.9803063457330414, + "grad_norm": 0.298828125, + "learning_rate": 0.00010078625003215182, + "loss": 1.5247, + "step": 6371 + }, + { + "epoch": 1.9806189434198187, + "grad_norm": 0.314453125, + "learning_rate": 0.00010076168020147672, + "loss": 2.2323, + "step": 6372 + }, + { + "epoch": 1.980931541106596, + "grad_norm": 0.271484375, + "learning_rate": 0.00010073711032481799, + "loss": 1.494, + "step": 6373 + }, + { + "epoch": 1.981244138793373, + "grad_norm": 0.283203125, + "learning_rate": 0.00010071254040365896, + "loss": 1.4078, + "step": 6374 + }, + { + "epoch": 1.98155673648015, + 
"grad_norm": 0.279296875, + "learning_rate": 0.00010068797043948297, + "loss": 1.5707, + "step": 6375 + }, + { + "epoch": 1.9818693341669271, + "grad_norm": 0.28125, + "learning_rate": 0.00010066340043377333, + "loss": 1.4195, + "step": 6376 + }, + { + "epoch": 1.9821819318537042, + "grad_norm": 0.302734375, + "learning_rate": 0.00010063883038801338, + "loss": 1.3336, + "step": 6377 + }, + { + "epoch": 1.9824945295404814, + "grad_norm": 0.294921875, + "learning_rate": 0.00010061426030368641, + "loss": 1.5426, + "step": 6378 + }, + { + "epoch": 1.9828071272272585, + "grad_norm": 0.28515625, + "learning_rate": 0.00010058969018227575, + "loss": 1.4451, + "step": 6379 + }, + { + "epoch": 1.9831197249140358, + "grad_norm": 0.28515625, + "learning_rate": 0.00010056512002526475, + "loss": 1.5482, + "step": 6380 + }, + { + "epoch": 1.9834323226008128, + "grad_norm": 0.29296875, + "learning_rate": 0.00010054054983413673, + "loss": 1.3617, + "step": 6381 + }, + { + "epoch": 1.9837449202875899, + "grad_norm": 0.283203125, + "learning_rate": 0.00010051597961037505, + "loss": 1.5777, + "step": 6382 + }, + { + "epoch": 1.984057517974367, + "grad_norm": 0.28515625, + "learning_rate": 0.00010049140935546299, + "loss": 1.6081, + "step": 6383 + }, + { + "epoch": 1.984370115661144, + "grad_norm": 0.310546875, + "learning_rate": 0.00010046683907088395, + "loss": 1.4387, + "step": 6384 + }, + { + "epoch": 1.9846827133479212, + "grad_norm": 0.28125, + "learning_rate": 0.00010044226875812121, + "loss": 1.6327, + "step": 6385 + }, + { + "epoch": 1.9849953110346985, + "grad_norm": 0.287109375, + "learning_rate": 0.00010041769841865818, + "loss": 1.6473, + "step": 6386 + }, + { + "epoch": 1.9853079087214756, + "grad_norm": 0.3046875, + "learning_rate": 0.00010039312805397813, + "loss": 1.3955, + "step": 6387 + }, + { + "epoch": 1.9856205064082526, + "grad_norm": 0.296875, + "learning_rate": 0.00010036855766556446, + "loss": 1.2537, + "step": 6388 + }, + { + "epoch": 1.9859331040950297, + "grad_norm": 0.291015625, + "learning_rate": 0.00010034398725490051, + "loss": 1.8257, + "step": 6389 + }, + { + "epoch": 1.9862457017818067, + "grad_norm": 0.310546875, + "learning_rate": 0.00010031941682346957, + "loss": 1.5564, + "step": 6390 + }, + { + "epoch": 1.986558299468584, + "grad_norm": 0.291015625, + "learning_rate": 0.00010029484637275504, + "loss": 1.4865, + "step": 6391 + }, + { + "epoch": 1.986870897155361, + "grad_norm": 0.296875, + "learning_rate": 0.00010027027590424028, + "loss": 1.834, + "step": 6392 + }, + { + "epoch": 1.9871834948421383, + "grad_norm": 0.30078125, + "learning_rate": 0.00010024570541940858, + "loss": 1.6117, + "step": 6393 + }, + { + "epoch": 1.9874960925289153, + "grad_norm": 0.28125, + "learning_rate": 0.00010022113491974336, + "loss": 1.4604, + "step": 6394 + }, + { + "epoch": 1.9878086902156924, + "grad_norm": 0.2890625, + "learning_rate": 0.00010019656440672795, + "loss": 1.7833, + "step": 6395 + }, + { + "epoch": 1.9881212879024694, + "grad_norm": 0.2890625, + "learning_rate": 0.00010017199388184568, + "loss": 1.4786, + "step": 6396 + }, + { + "epoch": 1.9884338855892465, + "grad_norm": 0.29296875, + "learning_rate": 0.00010014742334657993, + "loss": 1.3509, + "step": 6397 + }, + { + "epoch": 1.9887464832760238, + "grad_norm": 0.302734375, + "learning_rate": 0.00010012285280241404, + "loss": 1.3208, + "step": 6398 + } + ], + "logging_steps": 1, + "max_steps": 12796, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 3199, + "stateful_callbacks": { + "TrainerControl": 
{ + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3076986683457536e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}