{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998014165618587, "eval_steps": 500, "global_step": 4956, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00020173555620699328, "grad_norm": 2022.7607421875, "learning_rate": 6.711409395973155e-08, "loss": 1.7879, "step": 1 }, { "epoch": 0.00040347111241398657, "grad_norm": 22654.603515625, "learning_rate": 1.342281879194631e-07, "loss": 1.7023, "step": 2 }, { "epoch": 0.0006052066686209798, "grad_norm": 5139.21240234375, "learning_rate": 2.0134228187919465e-07, "loss": 2.1801, "step": 3 }, { "epoch": 0.0008069422248279731, "grad_norm": 2654.375, "learning_rate": 2.684563758389262e-07, "loss": 2.2953, "step": 4 }, { "epoch": 0.0010086777810349664, "grad_norm": 4316.03662109375, "learning_rate": 3.3557046979865777e-07, "loss": 1.7812, "step": 5 }, { "epoch": 0.0012104133372419596, "grad_norm": 7220.66845703125, "learning_rate": 4.026845637583893e-07, "loss": 1.8566, "step": 6 }, { "epoch": 0.001412148893448953, "grad_norm": 24308.83203125, "learning_rate": 4.6979865771812087e-07, "loss": 2.2305, "step": 7 }, { "epoch": 0.0016138844496559463, "grad_norm": 11270.6220703125, "learning_rate": 5.369127516778524e-07, "loss": 2.5174, "step": 8 }, { "epoch": 0.0018156200058629397, "grad_norm": 3041.62646484375, "learning_rate": 6.04026845637584e-07, "loss": 1.8983, "step": 9 }, { "epoch": 0.002017355562069933, "grad_norm": 4904.11328125, "learning_rate": 6.711409395973155e-07, "loss": 1.7648, "step": 10 }, { "epoch": 0.0022190911182769263, "grad_norm": 10130.5146484375, "learning_rate": 7.382550335570471e-07, "loss": 1.8282, "step": 11 }, { "epoch": 0.0024208266744839193, "grad_norm": 4326.14892578125, "learning_rate": 8.053691275167786e-07, "loss": 2.0227, "step": 12 }, { "epoch": 0.0026225622306909127, "grad_norm": 3840.538818359375, "learning_rate": 8.724832214765102e-07, "loss": 1.68, "step": 13 }, { "epoch": 0.002824297786897906, "grad_norm": 5577.01171875, "learning_rate": 9.395973154362417e-07, "loss": 1.7269, "step": 14 }, { "epoch": 0.0030260333431048995, "grad_norm": 6080.58642578125, "learning_rate": 1.006711409395973e-06, "loss": 1.6123, "step": 15 }, { "epoch": 0.0032277688993118925, "grad_norm": 2213.759521484375, "learning_rate": 1.0738255033557048e-06, "loss": 2.2998, "step": 16 }, { "epoch": 0.003429504455518886, "grad_norm": 5135.35693359375, "learning_rate": 1.1409395973154363e-06, "loss": 2.0855, "step": 17 }, { "epoch": 0.0036312400117258794, "grad_norm": 4677.1923828125, "learning_rate": 1.208053691275168e-06, "loss": 2.3382, "step": 18 }, { "epoch": 0.0038329755679328724, "grad_norm": 1373.9124755859375, "learning_rate": 1.2751677852348996e-06, "loss": 1.5623, "step": 19 }, { "epoch": 0.004034711124139866, "grad_norm": 1056.8350830078125, "learning_rate": 1.342281879194631e-06, "loss": 1.7456, "step": 20 }, { "epoch": 0.004236446680346859, "grad_norm": 25182.080078125, "learning_rate": 1.4093959731543623e-06, "loss": 1.5903, "step": 21 }, { "epoch": 0.004438182236553853, "grad_norm": 10851.70703125, "learning_rate": 1.4765100671140942e-06, "loss": 2.3867, "step": 22 }, { "epoch": 0.004639917792760846, "grad_norm": 6197.96728515625, "learning_rate": 1.5436241610738257e-06, "loss": 2.0905, "step": 23 }, { "epoch": 0.004841653348967839, "grad_norm": 1165.3834228515625, "learning_rate": 1.6107382550335572e-06, "loss": 1.5915, "step": 24 }, { "epoch": 0.005043388905174832, "grad_norm": 7833.49169921875, "learning_rate": 1.6778523489932889e-06, "loss": 1.593, "step": 25 }, { "epoch": 0.005245124461381825, "grad_norm": 10110.6279296875, "learning_rate": 1.7449664429530203e-06, "loss": 1.9986, "step": 26 }, { "epoch": 0.005446860017588819, "grad_norm": 6629.67138671875, "learning_rate": 1.8120805369127518e-06, "loss": 2.5279, "step": 27 }, { "epoch": 0.005648595573795812, "grad_norm": 4542.9150390625, "learning_rate": 1.8791946308724835e-06, "loss": 1.6785, "step": 28 }, { "epoch": 0.005850331130002806, "grad_norm": 2227.125, "learning_rate": 1.9463087248322147e-06, "loss": 1.4933, "step": 29 }, { "epoch": 0.006052066686209799, "grad_norm": 788.425048828125, "learning_rate": 2.013422818791946e-06, "loss": 1.4813, "step": 30 }, { "epoch": 0.006253802242416792, "grad_norm": 3233.39111328125, "learning_rate": 2.080536912751678e-06, "loss": 1.4689, "step": 31 }, { "epoch": 0.006455537798623785, "grad_norm": 1077.7388916015625, "learning_rate": 2.1476510067114096e-06, "loss": 1.4551, "step": 32 }, { "epoch": 0.0066572733548307785, "grad_norm": 800.4613647460938, "learning_rate": 2.2147651006711415e-06, "loss": 1.4164, "step": 33 }, { "epoch": 0.006859008911037772, "grad_norm": 329.1565856933594, "learning_rate": 2.2818791946308725e-06, "loss": 1.4257, "step": 34 }, { "epoch": 0.007060744467244765, "grad_norm": 480.46246337890625, "learning_rate": 2.348993288590604e-06, "loss": 1.5043, "step": 35 }, { "epoch": 0.007262480023451759, "grad_norm": 620.2860717773438, "learning_rate": 2.416107382550336e-06, "loss": 1.5757, "step": 36 }, { "epoch": 0.007464215579658751, "grad_norm": 472.3393859863281, "learning_rate": 2.4832214765100673e-06, "loss": 1.3904, "step": 37 }, { "epoch": 0.007665951135865745, "grad_norm": 845.4601440429688, "learning_rate": 2.5503355704697992e-06, "loss": 1.6462, "step": 38 }, { "epoch": 0.007867686692072738, "grad_norm": 1025.87548828125, "learning_rate": 2.6174496644295307e-06, "loss": 1.8661, "step": 39 }, { "epoch": 0.008069422248279732, "grad_norm": 219.7495880126953, "learning_rate": 2.684563758389262e-06, "loss": 1.4307, "step": 40 }, { "epoch": 0.008271157804486725, "grad_norm": 102.17412567138672, "learning_rate": 2.7516778523489936e-06, "loss": 1.2734, "step": 41 }, { "epoch": 0.008472893360693718, "grad_norm": 1972.4371337890625, "learning_rate": 2.8187919463087247e-06, "loss": 1.2536, "step": 42 }, { "epoch": 0.008674628916900712, "grad_norm": 7863.28759765625, "learning_rate": 2.885906040268457e-06, "loss": 1.5733, "step": 43 }, { "epoch": 0.008876364473107705, "grad_norm": 493.8641357421875, "learning_rate": 2.9530201342281885e-06, "loss": 1.3364, "step": 44 }, { "epoch": 0.009078100029314699, "grad_norm": 128.04525756835938, "learning_rate": 3.02013422818792e-06, "loss": 1.3062, "step": 45 }, { "epoch": 0.009279835585521692, "grad_norm": 1128.474609375, "learning_rate": 3.0872483221476514e-06, "loss": 1.3676, "step": 46 }, { "epoch": 0.009481571141728684, "grad_norm": 47.335845947265625, "learning_rate": 3.154362416107383e-06, "loss": 1.2489, "step": 47 }, { "epoch": 0.009683306697935677, "grad_norm": 38.27725601196289, "learning_rate": 3.2214765100671143e-06, "loss": 1.271, "step": 48 }, { "epoch": 0.00988504225414267, "grad_norm": 83.20748901367188, "learning_rate": 3.2885906040268462e-06, "loss": 1.8054, "step": 49 }, { "epoch": 0.010086777810349664, "grad_norm": 232.52198791503906, "learning_rate": 3.3557046979865777e-06, "loss": 1.3684, "step": 50 }, { "epoch": 0.010288513366556657, "grad_norm": 46.04096221923828, "learning_rate": 3.422818791946309e-06, "loss": 1.2638, "step": 51 }, { "epoch": 0.01049024892276365, "grad_norm": 91.04130554199219, "learning_rate": 3.4899328859060407e-06, "loss": 1.1821, "step": 52 }, { "epoch": 0.010691984478970644, "grad_norm": 75.88416290283203, "learning_rate": 3.557046979865772e-06, "loss": 1.4274, "step": 53 }, { "epoch": 0.010893720035177638, "grad_norm": 702.1119995117188, "learning_rate": 3.6241610738255036e-06, "loss": 1.1664, "step": 54 }, { "epoch": 0.011095455591384631, "grad_norm": 21.94797706604004, "learning_rate": 3.6912751677852355e-06, "loss": 1.4295, "step": 55 }, { "epoch": 0.011297191147591625, "grad_norm": 25.085304260253906, "learning_rate": 3.758389261744967e-06, "loss": 1.2236, "step": 56 }, { "epoch": 0.011498926703798618, "grad_norm": 9.833930969238281, "learning_rate": 3.825503355704698e-06, "loss": 1.1847, "step": 57 }, { "epoch": 0.011700662260005611, "grad_norm": 12.627878189086914, "learning_rate": 3.8926174496644295e-06, "loss": 1.0892, "step": 58 }, { "epoch": 0.011902397816212605, "grad_norm": 17.127168655395508, "learning_rate": 3.959731543624161e-06, "loss": 1.264, "step": 59 }, { "epoch": 0.012104133372419598, "grad_norm": 9.815498352050781, "learning_rate": 4.026845637583892e-06, "loss": 1.0999, "step": 60 }, { "epoch": 0.01230586892862659, "grad_norm": 23.42744255065918, "learning_rate": 4.093959731543625e-06, "loss": 1.0759, "step": 61 }, { "epoch": 0.012507604484833583, "grad_norm": 32.74321746826172, "learning_rate": 4.161073825503356e-06, "loss": 1.3767, "step": 62 }, { "epoch": 0.012709340041040577, "grad_norm": 10.060574531555176, "learning_rate": 4.228187919463088e-06, "loss": 1.137, "step": 63 }, { "epoch": 0.01291107559724757, "grad_norm": 25.0057373046875, "learning_rate": 4.295302013422819e-06, "loss": 1.3044, "step": 64 }, { "epoch": 0.013112811153454564, "grad_norm": 20.90718650817871, "learning_rate": 4.362416107382551e-06, "loss": 1.0647, "step": 65 }, { "epoch": 0.013314546709661557, "grad_norm": 11.432819366455078, "learning_rate": 4.429530201342283e-06, "loss": 1.1671, "step": 66 }, { "epoch": 0.01351628226586855, "grad_norm": 35.13310241699219, "learning_rate": 4.4966442953020135e-06, "loss": 1.0784, "step": 67 }, { "epoch": 0.013718017822075544, "grad_norm": 18.177671432495117, "learning_rate": 4.563758389261745e-06, "loss": 1.0769, "step": 68 }, { "epoch": 0.013919753378282537, "grad_norm": 239.5286102294922, "learning_rate": 4.6308724832214765e-06, "loss": 1.2211, "step": 69 }, { "epoch": 0.01412148893448953, "grad_norm": 6.744454383850098, "learning_rate": 4.697986577181208e-06, "loss": 1.1171, "step": 70 }, { "epoch": 0.014323224490696524, "grad_norm": 5.52532434463501, "learning_rate": 4.765100671140939e-06, "loss": 1.0494, "step": 71 }, { "epoch": 0.014524960046903517, "grad_norm": 4.606289863586426, "learning_rate": 4.832214765100672e-06, "loss": 1.0316, "step": 72 }, { "epoch": 0.014726695603110511, "grad_norm": 60.82453536987305, "learning_rate": 4.899328859060403e-06, "loss": 1.0452, "step": 73 }, { "epoch": 0.014928431159317503, "grad_norm": 6.075560569763184, "learning_rate": 4.966442953020135e-06, "loss": 1.1055, "step": 74 }, { "epoch": 0.015130166715524496, "grad_norm": 28.668581008911133, "learning_rate": 5.033557046979867e-06, "loss": 1.3083, "step": 75 }, { "epoch": 0.01533190227173149, "grad_norm": 5.29589319229126, "learning_rate": 5.1006711409395985e-06, "loss": 1.3318, "step": 76 }, { "epoch": 0.015533637827938483, "grad_norm": 11.307202339172363, "learning_rate": 5.16778523489933e-06, "loss": 1.0679, "step": 77 }, { "epoch": 0.015735373384145476, "grad_norm": 12.506169319152832, "learning_rate": 5.234899328859061e-06, "loss": 1.0901, "step": 78 }, { "epoch": 0.01593710894035247, "grad_norm": 3.876575469970703, "learning_rate": 5.302013422818793e-06, "loss": 1.2399, "step": 79 }, { "epoch": 0.016138844496559463, "grad_norm": 5.603764533996582, "learning_rate": 5.369127516778524e-06, "loss": 0.9899, "step": 80 }, { "epoch": 0.016340580052766455, "grad_norm": 3.92093563079834, "learning_rate": 5.436241610738256e-06, "loss": 1.0783, "step": 81 }, { "epoch": 0.01654231560897345, "grad_norm": 4.067335605621338, "learning_rate": 5.503355704697987e-06, "loss": 1.2255, "step": 82 }, { "epoch": 0.01674405116518044, "grad_norm": 23.241966247558594, "learning_rate": 5.570469798657718e-06, "loss": 1.0219, "step": 83 }, { "epoch": 0.016945786721387437, "grad_norm": 18.93113136291504, "learning_rate": 5.637583892617449e-06, "loss": 1.2462, "step": 84 }, { "epoch": 0.01714752227759443, "grad_norm": 3.556022882461548, "learning_rate": 5.704697986577181e-06, "loss": 1.0287, "step": 85 }, { "epoch": 0.017349257833801424, "grad_norm": 6.718225479125977, "learning_rate": 5.771812080536914e-06, "loss": 1.0052, "step": 86 }, { "epoch": 0.017550993390008415, "grad_norm": 16.812232971191406, "learning_rate": 5.8389261744966455e-06, "loss": 1.0208, "step": 87 }, { "epoch": 0.01775272894621541, "grad_norm": 10.673800468444824, "learning_rate": 5.906040268456377e-06, "loss": 1.0182, "step": 88 }, { "epoch": 0.017954464502422402, "grad_norm": 4.149665832519531, "learning_rate": 5.973154362416108e-06, "loss": 0.9561, "step": 89 }, { "epoch": 0.018156200058629397, "grad_norm": 412.8095703125, "learning_rate": 6.04026845637584e-06, "loss": 1.1994, "step": 90 }, { "epoch": 0.01835793561483639, "grad_norm": 10.464238166809082, "learning_rate": 6.107382550335571e-06, "loss": 0.9532, "step": 91 }, { "epoch": 0.018559671171043384, "grad_norm": 5.531954765319824, "learning_rate": 6.174496644295303e-06, "loss": 1.0273, "step": 92 }, { "epoch": 0.018761406727250376, "grad_norm": 13.847434997558594, "learning_rate": 6.241610738255034e-06, "loss": 1.2019, "step": 93 }, { "epoch": 0.018963142283457367, "grad_norm": 12.25539779663086, "learning_rate": 6.308724832214766e-06, "loss": 0.9668, "step": 94 }, { "epoch": 0.019164877839664363, "grad_norm": 7.894690036773682, "learning_rate": 6.375838926174497e-06, "loss": 0.9212, "step": 95 }, { "epoch": 0.019366613395871354, "grad_norm": 14.030834197998047, "learning_rate": 6.442953020134229e-06, "loss": 0.9981, "step": 96 }, { "epoch": 0.01956834895207835, "grad_norm": 3.4250216484069824, "learning_rate": 6.51006711409396e-06, "loss": 0.995, "step": 97 }, { "epoch": 0.01977008450828534, "grad_norm": 6.011563301086426, "learning_rate": 6.5771812080536925e-06, "loss": 1.0305, "step": 98 }, { "epoch": 0.019971820064492336, "grad_norm": 15.089917182922363, "learning_rate": 6.644295302013424e-06, "loss": 1.0044, "step": 99 }, { "epoch": 0.020173555620699328, "grad_norm": 9.317388534545898, "learning_rate": 6.711409395973155e-06, "loss": 0.9269, "step": 100 }, { "epoch": 0.020375291176906323, "grad_norm": 17.396928787231445, "learning_rate": 6.778523489932887e-06, "loss": 1.1154, "step": 101 }, { "epoch": 0.020577026733113315, "grad_norm": 5.767102241516113, "learning_rate": 6.845637583892618e-06, "loss": 1.0193, "step": 102 }, { "epoch": 0.02077876228932031, "grad_norm": 38.032161712646484, "learning_rate": 6.91275167785235e-06, "loss": 0.9445, "step": 103 }, { "epoch": 0.0209804978455273, "grad_norm": 23.336257934570312, "learning_rate": 6.979865771812081e-06, "loss": 1.149, "step": 104 }, { "epoch": 0.021182233401734297, "grad_norm": 84.36126708984375, "learning_rate": 7.046979865771813e-06, "loss": 0.9812, "step": 105 }, { "epoch": 0.02138396895794129, "grad_norm": 245.55101013183594, "learning_rate": 7.114093959731544e-06, "loss": 0.9889, "step": 106 }, { "epoch": 0.021585704514148284, "grad_norm": 17.846338272094727, "learning_rate": 7.181208053691276e-06, "loss": 1.1272, "step": 107 }, { "epoch": 0.021787440070355275, "grad_norm": 36.46464157104492, "learning_rate": 7.248322147651007e-06, "loss": 0.9686, "step": 108 }, { "epoch": 0.021989175626562267, "grad_norm": 3.497917413711548, "learning_rate": 7.3154362416107395e-06, "loss": 0.9897, "step": 109 }, { "epoch": 0.022190911182769262, "grad_norm": 15.46224594116211, "learning_rate": 7.382550335570471e-06, "loss": 1.3466, "step": 110 }, { "epoch": 0.022392646738976254, "grad_norm": 18.964536666870117, "learning_rate": 7.4496644295302024e-06, "loss": 1.3584, "step": 111 }, { "epoch": 0.02259438229518325, "grad_norm": 7.033268928527832, "learning_rate": 7.516778523489934e-06, "loss": 0.9985, "step": 112 }, { "epoch": 0.02279611785139024, "grad_norm": 186.49122619628906, "learning_rate": 7.583892617449665e-06, "loss": 0.9712, "step": 113 }, { "epoch": 0.022997853407597236, "grad_norm": 188.2796630859375, "learning_rate": 7.651006711409396e-06, "loss": 1.1078, "step": 114 }, { "epoch": 0.023199588963804228, "grad_norm": 17.890708923339844, "learning_rate": 7.718120805369127e-06, "loss": 0.9729, "step": 115 }, { "epoch": 0.023401324520011223, "grad_norm": 15.275798797607422, "learning_rate": 7.785234899328859e-06, "loss": 1.2821, "step": 116 }, { "epoch": 0.023603060076218214, "grad_norm": 20.20365333557129, "learning_rate": 7.85234899328859e-06, "loss": 1.0977, "step": 117 }, { "epoch": 0.02380479563242521, "grad_norm": 5.463934421539307, "learning_rate": 7.919463087248322e-06, "loss": 0.9282, "step": 118 }, { "epoch": 0.0240065311886322, "grad_norm": 4.599616050720215, "learning_rate": 7.986577181208053e-06, "loss": 0.9563, "step": 119 }, { "epoch": 0.024208266744839196, "grad_norm": 10.358293533325195, "learning_rate": 8.053691275167785e-06, "loss": 1.1196, "step": 120 }, { "epoch": 0.024410002301046188, "grad_norm": 10.99390697479248, "learning_rate": 8.120805369127518e-06, "loss": 0.9119, "step": 121 }, { "epoch": 0.02461173785725318, "grad_norm": 226.3212432861328, "learning_rate": 8.18791946308725e-06, "loss": 0.9652, "step": 122 }, { "epoch": 0.024813473413460175, "grad_norm": 188.92813110351562, "learning_rate": 8.255033557046981e-06, "loss": 0.9699, "step": 123 }, { "epoch": 0.025015208969667167, "grad_norm": 2.4782161712646484, "learning_rate": 8.322147651006712e-06, "loss": 0.9455, "step": 124 }, { "epoch": 0.025216944525874162, "grad_norm": 27.041120529174805, "learning_rate": 8.389261744966444e-06, "loss": 1.1747, "step": 125 }, { "epoch": 0.025418680082081153, "grad_norm": 8.053828239440918, "learning_rate": 8.456375838926175e-06, "loss": 1.1885, "step": 126 }, { "epoch": 0.02562041563828815, "grad_norm": 3.6767969131469727, "learning_rate": 8.523489932885907e-06, "loss": 1.0123, "step": 127 }, { "epoch": 0.02582215119449514, "grad_norm": 7.078328609466553, "learning_rate": 8.590604026845638e-06, "loss": 0.9259, "step": 128 }, { "epoch": 0.026023886750702135, "grad_norm": 15.417545318603516, "learning_rate": 8.65771812080537e-06, "loss": 0.9523, "step": 129 }, { "epoch": 0.026225622306909127, "grad_norm": 12.301921844482422, "learning_rate": 8.724832214765101e-06, "loss": 0.9276, "step": 130 }, { "epoch": 0.026427357863116122, "grad_norm": 13.825284957885742, "learning_rate": 8.791946308724833e-06, "loss": 0.8871, "step": 131 }, { "epoch": 0.026629093419323114, "grad_norm": 4.011205196380615, "learning_rate": 8.859060402684566e-06, "loss": 0.959, "step": 132 }, { "epoch": 0.02683082897553011, "grad_norm": 6.832649230957031, "learning_rate": 8.926174496644297e-06, "loss": 0.9345, "step": 133 }, { "epoch": 0.0270325645317371, "grad_norm": 10.89799976348877, "learning_rate": 8.993288590604027e-06, "loss": 0.9571, "step": 134 }, { "epoch": 0.027234300087944092, "grad_norm": 15.070175170898438, "learning_rate": 9.060402684563759e-06, "loss": 0.9965, "step": 135 }, { "epoch": 0.027436035644151088, "grad_norm": 9.60452938079834, "learning_rate": 9.12751677852349e-06, "loss": 0.9057, "step": 136 }, { "epoch": 0.02763777120035808, "grad_norm": 4.11983060836792, "learning_rate": 9.194630872483221e-06, "loss": 0.9351, "step": 137 }, { "epoch": 0.027839506756565074, "grad_norm": 8.37743091583252, "learning_rate": 9.261744966442953e-06, "loss": 1.1355, "step": 138 }, { "epoch": 0.028041242312772066, "grad_norm": 4.7272820472717285, "learning_rate": 9.328859060402684e-06, "loss": 1.0328, "step": 139 }, { "epoch": 0.02824297786897906, "grad_norm": 11.339620590209961, "learning_rate": 9.395973154362416e-06, "loss": 1.3605, "step": 140 }, { "epoch": 0.028444713425186053, "grad_norm": 295.88519287109375, "learning_rate": 9.463087248322147e-06, "loss": 0.9109, "step": 141 }, { "epoch": 0.028646448981393048, "grad_norm": 70.61186218261719, "learning_rate": 9.530201342281879e-06, "loss": 0.9349, "step": 142 }, { "epoch": 0.02884818453760004, "grad_norm": 14.863121032714844, "learning_rate": 9.59731543624161e-06, "loss": 0.9364, "step": 143 }, { "epoch": 0.029049920093807035, "grad_norm": 9.727048873901367, "learning_rate": 9.664429530201343e-06, "loss": 0.8834, "step": 144 }, { "epoch": 0.029251655650014027, "grad_norm": 2.5803744792938232, "learning_rate": 9.731543624161075e-06, "loss": 0.9208, "step": 145 }, { "epoch": 0.029453391206221022, "grad_norm": 9.345537185668945, "learning_rate": 9.798657718120806e-06, "loss": 0.9386, "step": 146 }, { "epoch": 0.029655126762428013, "grad_norm": 5.340958118438721, "learning_rate": 9.865771812080538e-06, "loss": 1.2306, "step": 147 }, { "epoch": 0.029856862318635005, "grad_norm": 11.54121208190918, "learning_rate": 9.93288590604027e-06, "loss": 0.9641, "step": 148 }, { "epoch": 0.030058597874842, "grad_norm": 16.07155418395996, "learning_rate": 1e-05, "loss": 0.8883, "step": 149 }, { "epoch": 0.030260333431048992, "grad_norm": 2.79647159576416, "learning_rate": 9.999998932196122e-06, "loss": 0.9043, "step": 150 }, { "epoch": 0.030462068987255987, "grad_norm": 14.367652893066406, "learning_rate": 9.99999572878494e-06, "loss": 0.9166, "step": 151 }, { "epoch": 0.03066380454346298, "grad_norm": 2.824852466583252, "learning_rate": 9.999990389767822e-06, "loss": 0.9134, "step": 152 }, { "epoch": 0.030865540099669974, "grad_norm": 3.328451156616211, "learning_rate": 9.999982915147052e-06, "loss": 1.3344, "step": 153 }, { "epoch": 0.031067275655876966, "grad_norm": 7.940481185913086, "learning_rate": 9.99997330492582e-06, "loss": 1.0262, "step": 154 }, { "epoch": 0.03126901121208396, "grad_norm": 5.5620856285095215, "learning_rate": 9.999961559108231e-06, "loss": 0.9052, "step": 155 }, { "epoch": 0.03147074676829095, "grad_norm": 6.570486068725586, "learning_rate": 9.999947677699302e-06, "loss": 0.8743, "step": 156 }, { "epoch": 0.031672482324497944, "grad_norm": 2.9782333374023438, "learning_rate": 9.999931660704962e-06, "loss": 0.9057, "step": 157 }, { "epoch": 0.03187421788070494, "grad_norm": 6.992729663848877, "learning_rate": 9.999913508132052e-06, "loss": 1.0055, "step": 158 }, { "epoch": 0.032075953436911935, "grad_norm": 3.99275803565979, "learning_rate": 9.999893219988329e-06, "loss": 0.9025, "step": 159 }, { "epoch": 0.032277688993118926, "grad_norm": 10.511117935180664, "learning_rate": 9.999870796282452e-06, "loss": 0.8839, "step": 160 }, { "epoch": 0.03247942454932592, "grad_norm": 3.7621190547943115, "learning_rate": 9.999846237024003e-06, "loss": 0.8776, "step": 161 }, { "epoch": 0.03268116010553291, "grad_norm": 2.77579402923584, "learning_rate": 9.99981954222347e-06, "loss": 0.8678, "step": 162 }, { "epoch": 0.03288289566173991, "grad_norm": 5.247135162353516, "learning_rate": 9.999790711892255e-06, "loss": 0.9416, "step": 163 }, { "epoch": 0.0330846312179469, "grad_norm": 3.735067844390869, "learning_rate": 9.999759746042674e-06, "loss": 0.8518, "step": 164 }, { "epoch": 0.03328636677415389, "grad_norm": 3.955429792404175, "learning_rate": 9.999726644687952e-06, "loss": 0.8846, "step": 165 }, { "epoch": 0.03348810233036088, "grad_norm": 7.231546401977539, "learning_rate": 9.999691407842228e-06, "loss": 0.8946, "step": 166 }, { "epoch": 0.03368983788656788, "grad_norm": 22.583908081054688, "learning_rate": 9.999654035520548e-06, "loss": 1.0744, "step": 167 }, { "epoch": 0.033891573442774874, "grad_norm": 31.7758846282959, "learning_rate": 9.999614527738882e-06, "loss": 0.8924, "step": 168 }, { "epoch": 0.034093308998981865, "grad_norm": 3.467855453491211, "learning_rate": 9.999572884514098e-06, "loss": 0.8897, "step": 169 }, { "epoch": 0.03429504455518886, "grad_norm": 3.608447551727295, "learning_rate": 9.999529105863986e-06, "loss": 0.9032, "step": 170 }, { "epoch": 0.034496780111395856, "grad_norm": 3.091294050216675, "learning_rate": 9.999483191807245e-06, "loss": 0.863, "step": 171 }, { "epoch": 0.03469851566760285, "grad_norm": 6.513212203979492, "learning_rate": 9.999435142363484e-06, "loss": 0.8868, "step": 172 }, { "epoch": 0.03490025122380984, "grad_norm": 1.8056596517562866, "learning_rate": 9.999384957553228e-06, "loss": 0.8478, "step": 173 }, { "epoch": 0.03510198678001683, "grad_norm": 3.0379676818847656, "learning_rate": 9.99933263739791e-06, "loss": 1.0226, "step": 174 }, { "epoch": 0.03530372233622382, "grad_norm": 2.665609121322632, "learning_rate": 9.99927818191988e-06, "loss": 0.9337, "step": 175 }, { "epoch": 0.03550545789243082, "grad_norm": 8.217555046081543, "learning_rate": 9.999221591142395e-06, "loss": 0.8608, "step": 176 }, { "epoch": 0.03570719344863781, "grad_norm": 2.4761505126953125, "learning_rate": 9.999162865089625e-06, "loss": 0.858, "step": 177 }, { "epoch": 0.035908929004844804, "grad_norm": 4.345452308654785, "learning_rate": 9.999102003786655e-06, "loss": 1.0062, "step": 178 }, { "epoch": 0.036110664561051796, "grad_norm": 3.3440897464752197, "learning_rate": 9.99903900725948e-06, "loss": 0.8589, "step": 179 }, { "epoch": 0.036312400117258795, "grad_norm": 4.387753009796143, "learning_rate": 9.998973875535006e-06, "loss": 0.8124, "step": 180 }, { "epoch": 0.036514135673465786, "grad_norm": 2.263735055923462, "learning_rate": 9.998906608641055e-06, "loss": 0.8761, "step": 181 }, { "epoch": 0.03671587122967278, "grad_norm": 4.443448066711426, "learning_rate": 9.998837206606355e-06, "loss": 0.808, "step": 182 }, { "epoch": 0.03691760678587977, "grad_norm": 2.865161895751953, "learning_rate": 9.998765669460551e-06, "loss": 0.9949, "step": 183 }, { "epoch": 0.03711934234208677, "grad_norm": 1.972935676574707, "learning_rate": 9.998691997234196e-06, "loss": 0.8248, "step": 184 }, { "epoch": 0.03732107789829376, "grad_norm": 2.0238354206085205, "learning_rate": 9.998616189958758e-06, "loss": 0.8415, "step": 185 }, { "epoch": 0.03752281345450075, "grad_norm": 2.7127320766448975, "learning_rate": 9.998538247666618e-06, "loss": 0.7998, "step": 186 }, { "epoch": 0.03772454901070774, "grad_norm": 2.8492398262023926, "learning_rate": 9.998458170391065e-06, "loss": 0.8178, "step": 187 }, { "epoch": 0.037926284566914735, "grad_norm": 2.2515549659729004, "learning_rate": 9.998375958166301e-06, "loss": 0.9846, "step": 188 }, { "epoch": 0.038128020123121734, "grad_norm": 2.6660799980163574, "learning_rate": 9.998291611027441e-06, "loss": 0.8962, "step": 189 }, { "epoch": 0.038329755679328725, "grad_norm": 2.132474660873413, "learning_rate": 9.998205129010515e-06, "loss": 0.8482, "step": 190 }, { "epoch": 0.03853149123553572, "grad_norm": 3.912343740463257, "learning_rate": 9.998116512152456e-06, "loss": 0.867, "step": 191 }, { "epoch": 0.03873322679174271, "grad_norm": 1.7469815015792847, "learning_rate": 9.998025760491117e-06, "loss": 1.018, "step": 192 }, { "epoch": 0.03893496234794971, "grad_norm": 1.599955439567566, "learning_rate": 9.997932874065259e-06, "loss": 0.9208, "step": 193 }, { "epoch": 0.0391366979041567, "grad_norm": 3.3400845527648926, "learning_rate": 9.997837852914557e-06, "loss": 0.8257, "step": 194 }, { "epoch": 0.03933843346036369, "grad_norm": 5.241814136505127, "learning_rate": 9.997740697079595e-06, "loss": 0.8121, "step": 195 }, { "epoch": 0.03954016901657068, "grad_norm": 3.9843716621398926, "learning_rate": 9.99764140660187e-06, "loss": 0.8926, "step": 196 }, { "epoch": 0.03974190457277768, "grad_norm": 2.6611931324005127, "learning_rate": 9.997539981523794e-06, "loss": 0.8292, "step": 197 }, { "epoch": 0.03994364012898467, "grad_norm": 0.9562042355537415, "learning_rate": 9.997436421888685e-06, "loss": 0.8107, "step": 198 }, { "epoch": 0.040145375685191664, "grad_norm": 3.046618700027466, "learning_rate": 9.997330727740778e-06, "loss": 0.9869, "step": 199 }, { "epoch": 0.040347111241398656, "grad_norm": 3.274746894836426, "learning_rate": 9.997222899125214e-06, "loss": 0.7945, "step": 200 }, { "epoch": 0.04054884679760565, "grad_norm": 2.0076584815979004, "learning_rate": 9.997112936088052e-06, "loss": 0.9723, "step": 201 }, { "epoch": 0.040750582353812646, "grad_norm": 2.4189558029174805, "learning_rate": 9.997000838676258e-06, "loss": 0.8047, "step": 202 }, { "epoch": 0.04095231791001964, "grad_norm": 1.8270951509475708, "learning_rate": 9.996886606937712e-06, "loss": 0.8327, "step": 203 }, { "epoch": 0.04115405346622663, "grad_norm": 1.598482370376587, "learning_rate": 9.996770240921205e-06, "loss": 0.8465, "step": 204 }, { "epoch": 0.04135578902243362, "grad_norm": 3.2114081382751465, "learning_rate": 9.996651740676439e-06, "loss": 0.7961, "step": 205 }, { "epoch": 0.04155752457864062, "grad_norm": 1.7791095972061157, "learning_rate": 9.996531106254027e-06, "loss": 0.788, "step": 206 }, { "epoch": 0.04175926013484761, "grad_norm": 4.112730503082275, "learning_rate": 9.996408337705497e-06, "loss": 0.7666, "step": 207 }, { "epoch": 0.0419609956910546, "grad_norm": 4.838507175445557, "learning_rate": 9.996283435083282e-06, "loss": 0.8164, "step": 208 }, { "epoch": 0.042162731247261595, "grad_norm": 7.356931209564209, "learning_rate": 9.996156398440735e-06, "loss": 0.8305, "step": 209 }, { "epoch": 0.042364466803468594, "grad_norm": 21.874197006225586, "learning_rate": 9.996027227832114e-06, "loss": 0.806, "step": 210 }, { "epoch": 0.042566202359675585, "grad_norm": 9.663326263427734, "learning_rate": 9.99589592331259e-06, "loss": 0.7798, "step": 211 }, { "epoch": 0.04276793791588258, "grad_norm": 3.5580976009368896, "learning_rate": 9.995762484938247e-06, "loss": 0.8254, "step": 212 }, { "epoch": 0.04296967347208957, "grad_norm": 5.485879898071289, "learning_rate": 9.995626912766081e-06, "loss": 0.7719, "step": 213 }, { "epoch": 0.04317140902829657, "grad_norm": 1.792300820350647, "learning_rate": 9.995489206853995e-06, "loss": 0.8067, "step": 214 }, { "epoch": 0.04337314458450356, "grad_norm": 2.1983845233917236, "learning_rate": 9.995349367260807e-06, "loss": 0.762, "step": 215 }, { "epoch": 0.04357488014071055, "grad_norm": 7.275660991668701, "learning_rate": 9.995207394046245e-06, "loss": 0.7808, "step": 216 }, { "epoch": 0.04377661569691754, "grad_norm": 1.4585134983062744, "learning_rate": 9.99506328727095e-06, "loss": 0.7906, "step": 217 }, { "epoch": 0.043978351253124534, "grad_norm": 1.4152764081954956, "learning_rate": 9.994917046996472e-06, "loss": 0.8777, "step": 218 }, { "epoch": 0.04418008680933153, "grad_norm": 1.9685487747192383, "learning_rate": 9.994768673285275e-06, "loss": 0.9062, "step": 219 }, { "epoch": 0.044381822365538524, "grad_norm": 1.1828324794769287, "learning_rate": 9.99461816620073e-06, "loss": 1.0103, "step": 220 }, { "epoch": 0.044583557921745516, "grad_norm": 1.5477691888809204, "learning_rate": 9.994465525807125e-06, "loss": 0.8139, "step": 221 }, { "epoch": 0.04478529347795251, "grad_norm": 1.9244577884674072, "learning_rate": 9.994310752169654e-06, "loss": 0.7512, "step": 222 }, { "epoch": 0.044987029034159506, "grad_norm": 1.0615133047103882, "learning_rate": 9.994153845354426e-06, "loss": 0.7983, "step": 223 }, { "epoch": 0.0451887645903665, "grad_norm": 2.2280080318450928, "learning_rate": 9.993994805428456e-06, "loss": 1.1871, "step": 224 }, { "epoch": 0.04539050014657349, "grad_norm": 3.0566983222961426, "learning_rate": 9.993833632459675e-06, "loss": 0.7883, "step": 225 }, { "epoch": 0.04559223570278048, "grad_norm": 1.9731240272521973, "learning_rate": 9.993670326516924e-06, "loss": 0.8278, "step": 226 }, { "epoch": 0.04579397125898748, "grad_norm": 1.77347731590271, "learning_rate": 9.993504887669955e-06, "loss": 0.7994, "step": 227 }, { "epoch": 0.04599570681519447, "grad_norm": 12.29296588897705, "learning_rate": 9.993337315989428e-06, "loss": 0.7465, "step": 228 }, { "epoch": 0.04619744237140146, "grad_norm": 5.521291732788086, "learning_rate": 9.99316761154692e-06, "loss": 0.8043, "step": 229 }, { "epoch": 0.046399177927608455, "grad_norm": 2.6495323181152344, "learning_rate": 9.992995774414912e-06, "loss": 0.8121, "step": 230 }, { "epoch": 0.04660091348381545, "grad_norm": 1.0269204378128052, "learning_rate": 9.992821804666803e-06, "loss": 0.8266, "step": 231 }, { "epoch": 0.046802649040022445, "grad_norm": 7.434131622314453, "learning_rate": 9.992645702376896e-06, "loss": 0.7602, "step": 232 }, { "epoch": 0.04700438459622944, "grad_norm": 36.971431732177734, "learning_rate": 9.992467467620408e-06, "loss": 0.8083, "step": 233 }, { "epoch": 0.04720612015243643, "grad_norm": 57.72653579711914, "learning_rate": 9.99228710047347e-06, "loss": 0.8107, "step": 234 }, { "epoch": 0.04740785570864342, "grad_norm": 15.51785659790039, "learning_rate": 9.992104601013117e-06, "loss": 0.7832, "step": 235 }, { "epoch": 0.04760959126485042, "grad_norm": 1.4890369176864624, "learning_rate": 9.9919199693173e-06, "loss": 0.7787, "step": 236 }, { "epoch": 0.04781132682105741, "grad_norm": 1.1738063097000122, "learning_rate": 9.991733205464882e-06, "loss": 0.7906, "step": 237 }, { "epoch": 0.0480130623772644, "grad_norm": 2.434535026550293, "learning_rate": 9.99154430953563e-06, "loss": 0.7879, "step": 238 }, { "epoch": 0.048214797933471394, "grad_norm": 1.8027598857879639, "learning_rate": 9.991353281610227e-06, "loss": 0.7958, "step": 239 }, { "epoch": 0.04841653348967839, "grad_norm": 1.322300672531128, "learning_rate": 9.991160121770265e-06, "loss": 0.9135, "step": 240 }, { "epoch": 0.048618269045885384, "grad_norm": 3.259050130844116, "learning_rate": 9.990964830098246e-06, "loss": 0.9424, "step": 241 }, { "epoch": 0.048820004602092376, "grad_norm": 1.3933802843093872, "learning_rate": 9.990767406677585e-06, "loss": 0.7914, "step": 242 }, { "epoch": 0.04902174015829937, "grad_norm": 1.3619595766067505, "learning_rate": 9.990567851592604e-06, "loss": 0.7864, "step": 243 }, { "epoch": 0.04922347571450636, "grad_norm": 2.9409220218658447, "learning_rate": 9.990366164928538e-06, "loss": 0.8433, "step": 244 }, { "epoch": 0.04942521127071336, "grad_norm": 1.1837221384048462, "learning_rate": 9.990162346771532e-06, "loss": 0.7411, "step": 245 }, { "epoch": 0.04962694682692035, "grad_norm": 0.7388036847114563, "learning_rate": 9.98995639720864e-06, "loss": 0.7743, "step": 246 }, { "epoch": 0.04982868238312734, "grad_norm": 1.7144968509674072, "learning_rate": 9.98974831632783e-06, "loss": 0.7733, "step": 247 }, { "epoch": 0.05003041793933433, "grad_norm": 0.677221417427063, "learning_rate": 9.989538104217975e-06, "loss": 0.7485, "step": 248 }, { "epoch": 0.05023215349554133, "grad_norm": 1.5204025506973267, "learning_rate": 9.989325760968865e-06, "loss": 0.8537, "step": 249 }, { "epoch": 0.050433889051748323, "grad_norm": 1.2902796268463135, "learning_rate": 9.98911128667119e-06, "loss": 0.7397, "step": 250 }, { "epoch": 0.050635624607955315, "grad_norm": 1.303731083869934, "learning_rate": 9.988894681416561e-06, "loss": 1.0365, "step": 251 }, { "epoch": 0.05083736016416231, "grad_norm": 3.029825448989868, "learning_rate": 9.988675945297497e-06, "loss": 0.9559, "step": 252 }, { "epoch": 0.051039095720369305, "grad_norm": 1.6813995838165283, "learning_rate": 9.98845507840742e-06, "loss": 0.8865, "step": 253 }, { "epoch": 0.0512408312765763, "grad_norm": 3.1005282402038574, "learning_rate": 9.988232080840668e-06, "loss": 0.7663, "step": 254 }, { "epoch": 0.05144256683278329, "grad_norm": 1.6702642440795898, "learning_rate": 9.98800695269249e-06, "loss": 0.7728, "step": 255 }, { "epoch": 0.05164430238899028, "grad_norm": 3.3101143836975098, "learning_rate": 9.987779694059043e-06, "loss": 0.8304, "step": 256 }, { "epoch": 0.05184603794519727, "grad_norm": 1.2295477390289307, "learning_rate": 9.987550305037392e-06, "loss": 0.7588, "step": 257 }, { "epoch": 0.05204777350140427, "grad_norm": 1.4161425828933716, "learning_rate": 9.987318785725517e-06, "loss": 0.8055, "step": 258 }, { "epoch": 0.05224950905761126, "grad_norm": 1.438620924949646, "learning_rate": 9.987085136222302e-06, "loss": 1.0795, "step": 259 }, { "epoch": 0.052451244613818254, "grad_norm": 1.5853941440582275, "learning_rate": 9.986849356627545e-06, "loss": 0.7861, "step": 260 }, { "epoch": 0.052652980170025246, "grad_norm": 1.0955591201782227, "learning_rate": 9.986611447041952e-06, "loss": 0.9781, "step": 261 }, { "epoch": 0.052854715726232245, "grad_norm": 9.280670166015625, "learning_rate": 9.98637140756714e-06, "loss": 0.7771, "step": 262 }, { "epoch": 0.053056451282439236, "grad_norm": 53.89901351928711, "learning_rate": 9.986129238305635e-06, "loss": 0.7909, "step": 263 }, { "epoch": 0.05325818683864623, "grad_norm": 0.7823047637939453, "learning_rate": 9.985884939360873e-06, "loss": 0.8604, "step": 264 }, { "epoch": 0.05345992239485322, "grad_norm": 14.755097389221191, "learning_rate": 9.985638510837197e-06, "loss": 0.861, "step": 265 }, { "epoch": 0.05366165795106022, "grad_norm": 0.7898758053779602, "learning_rate": 9.985389952839864e-06, "loss": 0.7715, "step": 266 }, { "epoch": 0.05386339350726721, "grad_norm": 1.4765191078186035, "learning_rate": 9.985139265475039e-06, "loss": 0.9422, "step": 267 }, { "epoch": 0.0540651290634742, "grad_norm": 0.8627459406852722, "learning_rate": 9.984886448849796e-06, "loss": 0.7241, "step": 268 }, { "epoch": 0.05426686461968119, "grad_norm": 1.3317475318908691, "learning_rate": 9.984631503072116e-06, "loss": 0.7417, "step": 269 }, { "epoch": 0.054468600175888185, "grad_norm": 0.6790952086448669, "learning_rate": 9.984374428250894e-06, "loss": 0.7364, "step": 270 }, { "epoch": 0.054670335732095184, "grad_norm": 6.166963577270508, "learning_rate": 9.984115224495933e-06, "loss": 0.7579, "step": 271 }, { "epoch": 0.054872071288302175, "grad_norm": 1.9955143928527832, "learning_rate": 9.983853891917942e-06, "loss": 0.884, "step": 272 }, { "epoch": 0.05507380684450917, "grad_norm": 4.1368889808654785, "learning_rate": 9.983590430628543e-06, "loss": 0.7424, "step": 273 }, { "epoch": 0.05527554240071616, "grad_norm": 3.234487295150757, "learning_rate": 9.983324840740265e-06, "loss": 0.8108, "step": 274 }, { "epoch": 0.05547727795692316, "grad_norm": 12.349787712097168, "learning_rate": 9.983057122366549e-06, "loss": 0.7564, "step": 275 }, { "epoch": 0.05567901351313015, "grad_norm": 2.3753366470336914, "learning_rate": 9.982787275621743e-06, "loss": 0.7861, "step": 276 }, { "epoch": 0.05588074906933714, "grad_norm": 4.729350566864014, "learning_rate": 9.982515300621103e-06, "loss": 0.7651, "step": 277 }, { "epoch": 0.05608248462554413, "grad_norm": 0.8417067527770996, "learning_rate": 9.982241197480795e-06, "loss": 0.7681, "step": 278 }, { "epoch": 0.05628422018175113, "grad_norm": 0.8154447674751282, "learning_rate": 9.981964966317897e-06, "loss": 0.7743, "step": 279 }, { "epoch": 0.05648595573795812, "grad_norm": 0.5765023231506348, "learning_rate": 9.981686607250391e-06, "loss": 0.958, "step": 280 }, { "epoch": 0.056687691294165114, "grad_norm": 1.1746493577957153, "learning_rate": 9.981406120397172e-06, "loss": 0.7329, "step": 281 }, { "epoch": 0.056889426850372106, "grad_norm": 1.1388757228851318, "learning_rate": 9.98112350587804e-06, "loss": 0.7618, "step": 282 }, { "epoch": 0.0570911624065791, "grad_norm": 0.5775352716445923, "learning_rate": 9.980838763813707e-06, "loss": 0.7496, "step": 283 }, { "epoch": 0.057292897962786096, "grad_norm": 1.036942481994629, "learning_rate": 9.980551894325793e-06, "loss": 0.857, "step": 284 }, { "epoch": 0.05749463351899309, "grad_norm": 1.5410163402557373, "learning_rate": 9.980262897536824e-06, "loss": 0.7326, "step": 285 }, { "epoch": 0.05769636907520008, "grad_norm": 0.9250368475914001, "learning_rate": 9.979971773570239e-06, "loss": 0.8455, "step": 286 }, { "epoch": 0.05789810463140707, "grad_norm": 0.8032963275909424, "learning_rate": 9.979678522550382e-06, "loss": 0.706, "step": 287 }, { "epoch": 0.05809984018761407, "grad_norm": 6.008584976196289, "learning_rate": 9.979383144602505e-06, "loss": 0.7733, "step": 288 }, { "epoch": 0.05830157574382106, "grad_norm": 1.6467902660369873, "learning_rate": 9.979085639852776e-06, "loss": 0.8366, "step": 289 }, { "epoch": 0.05850331130002805, "grad_norm": 5.2584404945373535, "learning_rate": 9.97878600842826e-06, "loss": 0.9207, "step": 290 }, { "epoch": 0.058705046856235045, "grad_norm": 1.9451853036880493, "learning_rate": 9.978484250456938e-06, "loss": 0.7263, "step": 291 }, { "epoch": 0.058906782412442044, "grad_norm": 0.9677664637565613, "learning_rate": 9.9781803660677e-06, "loss": 0.7551, "step": 292 }, { "epoch": 0.059108517968649035, "grad_norm": 2.3871355056762695, "learning_rate": 9.977874355390337e-06, "loss": 0.7409, "step": 293 }, { "epoch": 0.05931025352485603, "grad_norm": 2.6086199283599854, "learning_rate": 9.977566218555554e-06, "loss": 0.7638, "step": 294 }, { "epoch": 0.05951198908106302, "grad_norm": 2.170511484146118, "learning_rate": 9.977255955694967e-06, "loss": 0.7545, "step": 295 }, { "epoch": 0.05971372463727001, "grad_norm": 2.165513277053833, "learning_rate": 9.97694356694109e-06, "loss": 0.7556, "step": 296 }, { "epoch": 0.05991546019347701, "grad_norm": 1.170595407485962, "learning_rate": 9.976629052427353e-06, "loss": 0.7372, "step": 297 }, { "epoch": 0.060117195749684, "grad_norm": 1.5325140953063965, "learning_rate": 9.976312412288096e-06, "loss": 0.9574, "step": 298 }, { "epoch": 0.06031893130589099, "grad_norm": 1.3879196643829346, "learning_rate": 9.975993646658555e-06, "loss": 0.7547, "step": 299 }, { "epoch": 0.060520666862097984, "grad_norm": 0.6840426921844482, "learning_rate": 9.97567275567489e-06, "loss": 0.7828, "step": 300 }, { "epoch": 0.06072240241830498, "grad_norm": 0.6344276070594788, "learning_rate": 9.975349739474156e-06, "loss": 0.747, "step": 301 }, { "epoch": 0.060924137974511974, "grad_norm": 1.4871931076049805, "learning_rate": 9.975024598194318e-06, "loss": 1.0252, "step": 302 }, { "epoch": 0.061125873530718966, "grad_norm": 0.8229524493217468, "learning_rate": 9.974697331974255e-06, "loss": 0.8158, "step": 303 }, { "epoch": 0.06132760908692596, "grad_norm": 0.6888187527656555, "learning_rate": 9.974367940953748e-06, "loss": 0.9094, "step": 304 }, { "epoch": 0.061529344643132956, "grad_norm": 1.9582043886184692, "learning_rate": 9.974036425273487e-06, "loss": 0.7483, "step": 305 }, { "epoch": 0.06173108019933995, "grad_norm": 0.950177013874054, "learning_rate": 9.973702785075072e-06, "loss": 0.7646, "step": 306 }, { "epoch": 0.06193281575554694, "grad_norm": 1.6053050756454468, "learning_rate": 9.973367020501003e-06, "loss": 0.9114, "step": 307 }, { "epoch": 0.06213455131175393, "grad_norm": 0.6130600571632385, "learning_rate": 9.973029131694694e-06, "loss": 0.7913, "step": 308 }, { "epoch": 0.06233628686796092, "grad_norm": 1.4359782934188843, "learning_rate": 9.972689118800467e-06, "loss": 0.8401, "step": 309 }, { "epoch": 0.06253802242416792, "grad_norm": 0.8085805773735046, "learning_rate": 9.972346981963546e-06, "loss": 0.7986, "step": 310 }, { "epoch": 0.06273975798037491, "grad_norm": 2.3081631660461426, "learning_rate": 9.972002721330067e-06, "loss": 0.758, "step": 311 }, { "epoch": 0.0629414935365819, "grad_norm": 6.955173015594482, "learning_rate": 9.97165633704707e-06, "loss": 0.8835, "step": 312 }, { "epoch": 0.0631432290927889, "grad_norm": 42.62002182006836, "learning_rate": 9.971307829262504e-06, "loss": 0.7345, "step": 313 }, { "epoch": 0.06334496464899589, "grad_norm": 33.94930648803711, "learning_rate": 9.970957198125224e-06, "loss": 0.725, "step": 314 }, { "epoch": 0.06354670020520288, "grad_norm": 1.6801073551177979, "learning_rate": 9.97060444378499e-06, "loss": 0.7774, "step": 315 }, { "epoch": 0.06374843576140989, "grad_norm": 0.945922315120697, "learning_rate": 9.970249566392474e-06, "loss": 0.7666, "step": 316 }, { "epoch": 0.06395017131761688, "grad_norm": 0.5739346146583557, "learning_rate": 9.96989256609925e-06, "loss": 0.9993, "step": 317 }, { "epoch": 0.06415190687382387, "grad_norm": 2.0904643535614014, "learning_rate": 9.969533443057802e-06, "loss": 0.782, "step": 318 }, { "epoch": 0.06435364243003086, "grad_norm": 1.087174415588379, "learning_rate": 9.969172197421518e-06, "loss": 0.7468, "step": 319 }, { "epoch": 0.06455537798623785, "grad_norm": 1.0355284214019775, "learning_rate": 9.968808829344692e-06, "loss": 0.7677, "step": 320 }, { "epoch": 0.06475711354244484, "grad_norm": 1.3207528591156006, "learning_rate": 9.968443338982532e-06, "loss": 0.7234, "step": 321 }, { "epoch": 0.06495884909865184, "grad_norm": 4.863753795623779, "learning_rate": 9.96807572649114e-06, "loss": 0.8654, "step": 322 }, { "epoch": 0.06516058465485883, "grad_norm": 0.6006829738616943, "learning_rate": 9.967705992027537e-06, "loss": 0.8232, "step": 323 }, { "epoch": 0.06536232021106582, "grad_norm": 1.9106441736221313, "learning_rate": 9.96733413574964e-06, "loss": 0.7413, "step": 324 }, { "epoch": 0.06556405576727282, "grad_norm": 48.14797592163086, "learning_rate": 9.966960157816279e-06, "loss": 0.7532, "step": 325 }, { "epoch": 0.06576579132347982, "grad_norm": 47.85212707519531, "learning_rate": 9.96658405838719e-06, "loss": 0.7283, "step": 326 }, { "epoch": 0.06596752687968681, "grad_norm": 29.23980140686035, "learning_rate": 9.966205837623009e-06, "loss": 0.7057, "step": 327 }, { "epoch": 0.0661692624358938, "grad_norm": 3.5036604404449463, "learning_rate": 9.965825495685284e-06, "loss": 0.6846, "step": 328 }, { "epoch": 0.06637099799210079, "grad_norm": 0.6103178858757019, "learning_rate": 9.965443032736469e-06, "loss": 0.9649, "step": 329 }, { "epoch": 0.06657273354830778, "grad_norm": 1.7372721433639526, "learning_rate": 9.965058448939919e-06, "loss": 0.8735, "step": 330 }, { "epoch": 0.06677446910451477, "grad_norm": 0.6306778192520142, "learning_rate": 9.964671744459902e-06, "loss": 1.0855, "step": 331 }, { "epoch": 0.06697620466072177, "grad_norm": 1.587566614151001, "learning_rate": 9.964282919461584e-06, "loss": 0.7498, "step": 332 }, { "epoch": 0.06717794021692877, "grad_norm": 1.041245460510254, "learning_rate": 9.963891974111042e-06, "loss": 0.7863, "step": 333 }, { "epoch": 0.06737967577313576, "grad_norm": 0.8831943869590759, "learning_rate": 9.963498908575258e-06, "loss": 0.7402, "step": 334 }, { "epoch": 0.06758141132934276, "grad_norm": 1.5780138969421387, "learning_rate": 9.963103723022117e-06, "loss": 0.9024, "step": 335 }, { "epoch": 0.06778314688554975, "grad_norm": 0.7395191788673401, "learning_rate": 9.962706417620413e-06, "loss": 0.7374, "step": 336 }, { "epoch": 0.06798488244175674, "grad_norm": 1.0590225458145142, "learning_rate": 9.962306992539842e-06, "loss": 0.7621, "step": 337 }, { "epoch": 0.06818661799796373, "grad_norm": 0.5914890170097351, "learning_rate": 9.96190544795101e-06, "loss": 0.7071, "step": 338 }, { "epoch": 0.06838835355417072, "grad_norm": 1.8004124164581299, "learning_rate": 9.961501784025423e-06, "loss": 0.7526, "step": 339 }, { "epoch": 0.06859008911037771, "grad_norm": 1.0831773281097412, "learning_rate": 9.961096000935493e-06, "loss": 0.6934, "step": 340 }, { "epoch": 0.0687918246665847, "grad_norm": 0.832261323928833, "learning_rate": 9.960688098854542e-06, "loss": 0.7978, "step": 341 }, { "epoch": 0.06899356022279171, "grad_norm": 0.72775799036026, "learning_rate": 9.960278077956792e-06, "loss": 0.8441, "step": 342 }, { "epoch": 0.0691952957789987, "grad_norm": 2.9648144245147705, "learning_rate": 9.959865938417372e-06, "loss": 0.751, "step": 343 }, { "epoch": 0.0693970313352057, "grad_norm": 1.3026782274246216, "learning_rate": 9.959451680412316e-06, "loss": 0.7462, "step": 344 }, { "epoch": 0.06959876689141269, "grad_norm": 2.5709228515625, "learning_rate": 9.959035304118563e-06, "loss": 0.7466, "step": 345 }, { "epoch": 0.06980050244761968, "grad_norm": 2.0137813091278076, "learning_rate": 9.958616809713955e-06, "loss": 0.7713, "step": 346 }, { "epoch": 0.07000223800382667, "grad_norm": 1.1602307558059692, "learning_rate": 9.958196197377242e-06, "loss": 0.9285, "step": 347 }, { "epoch": 0.07020397356003366, "grad_norm": 0.5965341925621033, "learning_rate": 9.957773467288074e-06, "loss": 0.7451, "step": 348 }, { "epoch": 0.07040570911624065, "grad_norm": 0.5603690147399902, "learning_rate": 9.95734861962701e-06, "loss": 0.7483, "step": 349 }, { "epoch": 0.07060744467244764, "grad_norm": 1.2296565771102905, "learning_rate": 9.95692165457551e-06, "loss": 0.7167, "step": 350 }, { "epoch": 0.07080918022865465, "grad_norm": 1.3939180374145508, "learning_rate": 9.95649257231594e-06, "loss": 0.7404, "step": 351 }, { "epoch": 0.07101091578486164, "grad_norm": 0.8631527423858643, "learning_rate": 9.956061373031573e-06, "loss": 0.807, "step": 352 }, { "epoch": 0.07121265134106863, "grad_norm": 0.6659342050552368, "learning_rate": 9.955628056906584e-06, "loss": 0.6811, "step": 353 }, { "epoch": 0.07141438689727563, "grad_norm": 0.7338282465934753, "learning_rate": 9.955192624126045e-06, "loss": 0.721, "step": 354 }, { "epoch": 0.07161612245348262, "grad_norm": 0.7555668950080872, "learning_rate": 9.954755074875946e-06, "loss": 0.8442, "step": 355 }, { "epoch": 0.07181785800968961, "grad_norm": 1.4037517309188843, "learning_rate": 9.95431540934317e-06, "loss": 0.7371, "step": 356 }, { "epoch": 0.0720195935658966, "grad_norm": 1.6213260889053345, "learning_rate": 9.953873627715506e-06, "loss": 0.7541, "step": 357 }, { "epoch": 0.07222132912210359, "grad_norm": 2.8973515033721924, "learning_rate": 9.953429730181653e-06, "loss": 0.7394, "step": 358 }, { "epoch": 0.0724230646783106, "grad_norm": 1.1352715492248535, "learning_rate": 9.952983716931209e-06, "loss": 0.6834, "step": 359 }, { "epoch": 0.07262480023451759, "grad_norm": 0.7093841433525085, "learning_rate": 9.952535588154673e-06, "loss": 0.75, "step": 360 }, { "epoch": 0.07282653579072458, "grad_norm": 1.4455276727676392, "learning_rate": 9.95208534404345e-06, "loss": 0.7095, "step": 361 }, { "epoch": 0.07302827134693157, "grad_norm": 0.5514054894447327, "learning_rate": 9.951632984789851e-06, "loss": 0.7518, "step": 362 }, { "epoch": 0.07323000690313856, "grad_norm": 2.37100887298584, "learning_rate": 9.951178510587087e-06, "loss": 0.7373, "step": 363 }, { "epoch": 0.07343174245934556, "grad_norm": 0.6046085953712463, "learning_rate": 9.950721921629276e-06, "loss": 0.7272, "step": 364 }, { "epoch": 0.07363347801555255, "grad_norm": 0.8679599165916443, "learning_rate": 9.950263218111435e-06, "loss": 0.6825, "step": 365 }, { "epoch": 0.07383521357175954, "grad_norm": 1.1433424949645996, "learning_rate": 9.949802400229486e-06, "loss": 0.7595, "step": 366 }, { "epoch": 0.07403694912796653, "grad_norm": 0.6897748708724976, "learning_rate": 9.949339468180256e-06, "loss": 0.8082, "step": 367 }, { "epoch": 0.07423868468417354, "grad_norm": 0.674286425113678, "learning_rate": 9.948874422161473e-06, "loss": 0.8014, "step": 368 }, { "epoch": 0.07444042024038053, "grad_norm": 0.6043882966041565, "learning_rate": 9.948407262371764e-06, "loss": 0.7644, "step": 369 }, { "epoch": 0.07464215579658752, "grad_norm": 1.9822863340377808, "learning_rate": 9.947937989010668e-06, "loss": 0.7242, "step": 370 }, { "epoch": 0.07484389135279451, "grad_norm": 0.5006752610206604, "learning_rate": 9.947466602278621e-06, "loss": 0.7257, "step": 371 }, { "epoch": 0.0750456269090015, "grad_norm": 1.2581019401550293, "learning_rate": 9.946993102376961e-06, "loss": 1.0325, "step": 372 }, { "epoch": 0.0752473624652085, "grad_norm": 0.8816624879837036, "learning_rate": 9.94651748950793e-06, "loss": 0.9296, "step": 373 }, { "epoch": 0.07544909802141549, "grad_norm": 0.9441580772399902, "learning_rate": 9.946039763874674e-06, "loss": 0.7625, "step": 374 }, { "epoch": 0.07565083357762248, "grad_norm": 0.8706744909286499, "learning_rate": 9.945559925681238e-06, "loss": 0.7524, "step": 375 }, { "epoch": 0.07585256913382947, "grad_norm": 0.7602512836456299, "learning_rate": 9.945077975132573e-06, "loss": 0.7468, "step": 376 }, { "epoch": 0.07605430469003648, "grad_norm": 0.7464005947113037, "learning_rate": 9.94459391243453e-06, "loss": 0.7798, "step": 377 }, { "epoch": 0.07625604024624347, "grad_norm": 5.947686195373535, "learning_rate": 9.944107737793862e-06, "loss": 0.7422, "step": 378 }, { "epoch": 0.07645777580245046, "grad_norm": 11.784186363220215, "learning_rate": 9.943619451418225e-06, "loss": 0.7596, "step": 379 }, { "epoch": 0.07665951135865745, "grad_norm": 2.201178550720215, "learning_rate": 9.943129053516176e-06, "loss": 0.7318, "step": 380 }, { "epoch": 0.07686124691486444, "grad_norm": 1.0005850791931152, "learning_rate": 9.942636544297175e-06, "loss": 0.7558, "step": 381 }, { "epoch": 0.07706298247107143, "grad_norm": 0.6544851064682007, "learning_rate": 9.942141923971584e-06, "loss": 0.7184, "step": 382 }, { "epoch": 0.07726471802727843, "grad_norm": 1.1898512840270996, "learning_rate": 9.941645192750665e-06, "loss": 0.7013, "step": 383 }, { "epoch": 0.07746645358348542, "grad_norm": 0.9880014061927795, "learning_rate": 9.941146350846583e-06, "loss": 0.7444, "step": 384 }, { "epoch": 0.07766818913969242, "grad_norm": 0.6566327214241028, "learning_rate": 9.940645398472405e-06, "loss": 0.748, "step": 385 }, { "epoch": 0.07786992469589941, "grad_norm": 3.9069302082061768, "learning_rate": 9.940142335842097e-06, "loss": 0.7627, "step": 386 }, { "epoch": 0.0780716602521064, "grad_norm": 0.8036094307899475, "learning_rate": 9.939637163170528e-06, "loss": 0.9697, "step": 387 }, { "epoch": 0.0782733958083134, "grad_norm": 0.5197206139564514, "learning_rate": 9.939129880673471e-06, "loss": 0.7418, "step": 388 }, { "epoch": 0.07847513136452039, "grad_norm": 0.5764813423156738, "learning_rate": 9.938620488567592e-06, "loss": 0.8098, "step": 389 }, { "epoch": 0.07867686692072738, "grad_norm": 4.1930832862854, "learning_rate": 9.938108987070467e-06, "loss": 0.796, "step": 390 }, { "epoch": 0.07887860247693437, "grad_norm": 1.410750389099121, "learning_rate": 9.93759537640057e-06, "loss": 0.7219, "step": 391 }, { "epoch": 0.07908033803314136, "grad_norm": 0.6416834592819214, "learning_rate": 9.937079656777275e-06, "loss": 0.7999, "step": 392 }, { "epoch": 0.07928207358934836, "grad_norm": 0.6255632042884827, "learning_rate": 9.936561828420854e-06, "loss": 0.7706, "step": 393 }, { "epoch": 0.07948380914555536, "grad_norm": 2.998563051223755, "learning_rate": 9.936041891552484e-06, "loss": 0.7554, "step": 394 }, { "epoch": 0.07968554470176235, "grad_norm": 1.068771481513977, "learning_rate": 9.935519846394242e-06, "loss": 0.7692, "step": 395 }, { "epoch": 0.07988728025796935, "grad_norm": 1.0073810815811157, "learning_rate": 9.934995693169104e-06, "loss": 0.7764, "step": 396 }, { "epoch": 0.08008901581417634, "grad_norm": 2.54864501953125, "learning_rate": 9.93446943210095e-06, "loss": 0.7399, "step": 397 }, { "epoch": 0.08029075137038333, "grad_norm": 1.4615826606750488, "learning_rate": 9.933941063414553e-06, "loss": 0.749, "step": 398 }, { "epoch": 0.08049248692659032, "grad_norm": 2.118624448776245, "learning_rate": 9.933410587335594e-06, "loss": 0.6639, "step": 399 }, { "epoch": 0.08069422248279731, "grad_norm": 1.0449012517929077, "learning_rate": 9.93287800409065e-06, "loss": 0.7129, "step": 400 }, { "epoch": 0.0808959580390043, "grad_norm": 0.46440356969833374, "learning_rate": 9.932343313907196e-06, "loss": 0.7212, "step": 401 }, { "epoch": 0.0810976935952113, "grad_norm": 1.939047932624817, "learning_rate": 9.931806517013612e-06, "loss": 0.7696, "step": 402 }, { "epoch": 0.0812994291514183, "grad_norm": 0.7793079018592834, "learning_rate": 9.931267613639177e-06, "loss": 0.7481, "step": 403 }, { "epoch": 0.08150116470762529, "grad_norm": 0.884232223033905, "learning_rate": 9.930726604014066e-06, "loss": 0.8048, "step": 404 }, { "epoch": 0.08170290026383228, "grad_norm": 1.6912051439285278, "learning_rate": 9.930183488369357e-06, "loss": 0.8251, "step": 405 }, { "epoch": 0.08190463582003928, "grad_norm": 0.6897804737091064, "learning_rate": 9.929638266937025e-06, "loss": 0.7105, "step": 406 }, { "epoch": 0.08210637137624627, "grad_norm": 1.0087461471557617, "learning_rate": 9.929090939949948e-06, "loss": 0.7334, "step": 407 }, { "epoch": 0.08230810693245326, "grad_norm": 1.2530308961868286, "learning_rate": 9.9285415076419e-06, "loss": 0.7001, "step": 408 }, { "epoch": 0.08250984248866025, "grad_norm": 0.5179945230484009, "learning_rate": 9.927989970247554e-06, "loss": 0.749, "step": 409 }, { "epoch": 0.08271157804486724, "grad_norm": 0.757422685623169, "learning_rate": 9.927436328002487e-06, "loss": 0.685, "step": 410 }, { "epoch": 0.08291331360107425, "grad_norm": 1.2727844715118408, "learning_rate": 9.926880581143168e-06, "loss": 0.8697, "step": 411 }, { "epoch": 0.08311504915728124, "grad_norm": 0.7067129611968994, "learning_rate": 9.926322729906968e-06, "loss": 0.7317, "step": 412 }, { "epoch": 0.08331678471348823, "grad_norm": 7.245047569274902, "learning_rate": 9.925762774532162e-06, "loss": 0.7562, "step": 413 }, { "epoch": 0.08351852026969522, "grad_norm": 1.140660047531128, "learning_rate": 9.925200715257915e-06, "loss": 0.7611, "step": 414 }, { "epoch": 0.08372025582590222, "grad_norm": 0.8195030093193054, "learning_rate": 9.924636552324296e-06, "loss": 0.7434, "step": 415 }, { "epoch": 0.0839219913821092, "grad_norm": 0.5836001634597778, "learning_rate": 9.92407028597227e-06, "loss": 0.7571, "step": 416 }, { "epoch": 0.0841237269383162, "grad_norm": 0.7573882341384888, "learning_rate": 9.923501916443704e-06, "loss": 0.7345, "step": 417 }, { "epoch": 0.08432546249452319, "grad_norm": 0.625736653804779, "learning_rate": 9.922931443981358e-06, "loss": 0.8219, "step": 418 }, { "epoch": 0.08452719805073018, "grad_norm": 0.9961444139480591, "learning_rate": 9.922358868828896e-06, "loss": 0.7045, "step": 419 }, { "epoch": 0.08472893360693719, "grad_norm": 0.8629128932952881, "learning_rate": 9.921784191230874e-06, "loss": 0.7361, "step": 420 }, { "epoch": 0.08493066916314418, "grad_norm": 0.8115153908729553, "learning_rate": 9.921207411432752e-06, "loss": 0.7315, "step": 421 }, { "epoch": 0.08513240471935117, "grad_norm": 1.0216450691223145, "learning_rate": 9.920628529680882e-06, "loss": 0.7691, "step": 422 }, { "epoch": 0.08533414027555816, "grad_norm": 0.8689268827438354, "learning_rate": 9.920047546222522e-06, "loss": 0.8783, "step": 423 }, { "epoch": 0.08553587583176515, "grad_norm": 1.2929258346557617, "learning_rate": 9.919464461305817e-06, "loss": 0.7367, "step": 424 }, { "epoch": 0.08573761138797215, "grad_norm": 0.7794530987739563, "learning_rate": 9.918879275179819e-06, "loss": 0.7366, "step": 425 }, { "epoch": 0.08593934694417914, "grad_norm": 2.4730234146118164, "learning_rate": 9.91829198809447e-06, "loss": 0.8005, "step": 426 }, { "epoch": 0.08614108250038613, "grad_norm": 2.03058123588562, "learning_rate": 9.917702600300615e-06, "loss": 0.7365, "step": 427 }, { "epoch": 0.08634281805659313, "grad_norm": 0.6619600653648376, "learning_rate": 9.917111112049996e-06, "loss": 0.718, "step": 428 }, { "epoch": 0.08654455361280013, "grad_norm": 1.198303461074829, "learning_rate": 9.916517523595248e-06, "loss": 0.9309, "step": 429 }, { "epoch": 0.08674628916900712, "grad_norm": 0.6509268879890442, "learning_rate": 9.915921835189906e-06, "loss": 0.7377, "step": 430 }, { "epoch": 0.08694802472521411, "grad_norm": 0.9966859817504883, "learning_rate": 9.915324047088402e-06, "loss": 0.716, "step": 431 }, { "epoch": 0.0871497602814211, "grad_norm": 2.054065227508545, "learning_rate": 9.914724159546063e-06, "loss": 0.7177, "step": 432 }, { "epoch": 0.0873514958376281, "grad_norm": 1.0520039796829224, "learning_rate": 9.914122172819113e-06, "loss": 0.6613, "step": 433 }, { "epoch": 0.08755323139383508, "grad_norm": 0.4894033670425415, "learning_rate": 9.913518087164678e-06, "loss": 0.7384, "step": 434 }, { "epoch": 0.08775496695004208, "grad_norm": 0.550117552280426, "learning_rate": 9.912911902840771e-06, "loss": 0.7757, "step": 435 }, { "epoch": 0.08795670250624907, "grad_norm": 0.8945670127868652, "learning_rate": 9.91230362010631e-06, "loss": 0.6971, "step": 436 }, { "epoch": 0.08815843806245607, "grad_norm": 0.4656491279602051, "learning_rate": 9.911693239221101e-06, "loss": 0.7452, "step": 437 }, { "epoch": 0.08836017361866307, "grad_norm": 0.7141634225845337, "learning_rate": 9.911080760445857e-06, "loss": 0.8922, "step": 438 }, { "epoch": 0.08856190917487006, "grad_norm": 3.367650032043457, "learning_rate": 9.910466184042177e-06, "loss": 0.7144, "step": 439 }, { "epoch": 0.08876364473107705, "grad_norm": 0.4969181716442108, "learning_rate": 9.90984951027256e-06, "loss": 0.741, "step": 440 }, { "epoch": 0.08896538028728404, "grad_norm": 0.9121612906455994, "learning_rate": 9.909230739400402e-06, "loss": 0.8275, "step": 441 }, { "epoch": 0.08916711584349103, "grad_norm": 0.5296851992607117, "learning_rate": 9.908609871689992e-06, "loss": 0.6998, "step": 442 }, { "epoch": 0.08936885139969802, "grad_norm": 0.6200650334358215, "learning_rate": 9.907986907406517e-06, "loss": 0.6965, "step": 443 }, { "epoch": 0.08957058695590502, "grad_norm": 0.9386738538742065, "learning_rate": 9.907361846816057e-06, "loss": 0.8557, "step": 444 }, { "epoch": 0.08977232251211201, "grad_norm": 0.6921373009681702, "learning_rate": 9.90673469018559e-06, "loss": 0.7537, "step": 445 }, { "epoch": 0.08997405806831901, "grad_norm": 1.0132421255111694, "learning_rate": 9.90610543778299e-06, "loss": 0.7246, "step": 446 }, { "epoch": 0.090175793624526, "grad_norm": 3.3522820472717285, "learning_rate": 9.90547408987702e-06, "loss": 0.7415, "step": 447 }, { "epoch": 0.090377529180733, "grad_norm": 4.449716091156006, "learning_rate": 9.904840646737346e-06, "loss": 0.8019, "step": 448 }, { "epoch": 0.09057926473693999, "grad_norm": 3.593144416809082, "learning_rate": 9.904205108634525e-06, "loss": 0.6708, "step": 449 }, { "epoch": 0.09078100029314698, "grad_norm": 9.274864196777344, "learning_rate": 9.903567475840005e-06, "loss": 0.7104, "step": 450 }, { "epoch": 0.09098273584935397, "grad_norm": 3.4164490699768066, "learning_rate": 9.902927748626139e-06, "loss": 0.8612, "step": 451 }, { "epoch": 0.09118447140556096, "grad_norm": 1.1139180660247803, "learning_rate": 9.902285927266162e-06, "loss": 0.7185, "step": 452 }, { "epoch": 0.09138620696176795, "grad_norm": 1.6076403856277466, "learning_rate": 9.901642012034214e-06, "loss": 0.7423, "step": 453 }, { "epoch": 0.09158794251797496, "grad_norm": 0.48474082350730896, "learning_rate": 9.900996003205323e-06, "loss": 0.6703, "step": 454 }, { "epoch": 0.09178967807418195, "grad_norm": 17.713228225708008, "learning_rate": 9.900347901055414e-06, "loss": 0.8088, "step": 455 }, { "epoch": 0.09199141363038894, "grad_norm": 21.032386779785156, "learning_rate": 9.899697705861304e-06, "loss": 0.7579, "step": 456 }, { "epoch": 0.09219314918659594, "grad_norm": 2.4759697914123535, "learning_rate": 9.899045417900709e-06, "loss": 0.8178, "step": 457 }, { "epoch": 0.09239488474280293, "grad_norm": 0.45519858598709106, "learning_rate": 9.898391037452231e-06, "loss": 0.7225, "step": 458 }, { "epoch": 0.09259662029900992, "grad_norm": 1.385945200920105, "learning_rate": 9.897734564795374e-06, "loss": 0.8623, "step": 459 }, { "epoch": 0.09279835585521691, "grad_norm": 0.9438928961753845, "learning_rate": 9.897076000210528e-06, "loss": 0.7484, "step": 460 }, { "epoch": 0.0930000914114239, "grad_norm": 0.42515015602111816, "learning_rate": 9.896415343978982e-06, "loss": 0.7168, "step": 461 }, { "epoch": 0.0932018269676309, "grad_norm": 0.8145204782485962, "learning_rate": 9.895752596382916e-06, "loss": 0.7254, "step": 462 }, { "epoch": 0.0934035625238379, "grad_norm": 0.9704925417900085, "learning_rate": 9.895087757705406e-06, "loss": 0.741, "step": 463 }, { "epoch": 0.09360529808004489, "grad_norm": 0.5286685824394226, "learning_rate": 9.894420828230416e-06, "loss": 0.745, "step": 464 }, { "epoch": 0.09380703363625188, "grad_norm": 1.0488917827606201, "learning_rate": 9.893751808242805e-06, "loss": 0.702, "step": 465 }, { "epoch": 0.09400876919245887, "grad_norm": 0.8315796256065369, "learning_rate": 9.89308069802833e-06, "loss": 0.9462, "step": 466 }, { "epoch": 0.09421050474866587, "grad_norm": 0.5982228517532349, "learning_rate": 9.892407497873633e-06, "loss": 0.7748, "step": 467 }, { "epoch": 0.09441224030487286, "grad_norm": 11.72299575805664, "learning_rate": 9.891732208066254e-06, "loss": 0.722, "step": 468 }, { "epoch": 0.09461397586107985, "grad_norm": 0.6634166836738586, "learning_rate": 9.891054828894624e-06, "loss": 0.6943, "step": 469 }, { "epoch": 0.09481571141728684, "grad_norm": 0.5095616579055786, "learning_rate": 9.890375360648065e-06, "loss": 0.6962, "step": 470 }, { "epoch": 0.09501744697349383, "grad_norm": 0.49047261476516724, "learning_rate": 9.889693803616793e-06, "loss": 0.969, "step": 471 }, { "epoch": 0.09521918252970084, "grad_norm": 1.026039958000183, "learning_rate": 9.889010158091917e-06, "loss": 0.7256, "step": 472 }, { "epoch": 0.09542091808590783, "grad_norm": 0.9997348189353943, "learning_rate": 9.888324424365435e-06, "loss": 0.7145, "step": 473 }, { "epoch": 0.09562265364211482, "grad_norm": 2.0080642700195312, "learning_rate": 9.88763660273024e-06, "loss": 1.0393, "step": 474 }, { "epoch": 0.09582438919832181, "grad_norm": 0.9626255631446838, "learning_rate": 9.886946693480114e-06, "loss": 0.749, "step": 475 }, { "epoch": 0.0960261247545288, "grad_norm": 0.7559981346130371, "learning_rate": 9.886254696909733e-06, "loss": 0.7095, "step": 476 }, { "epoch": 0.0962278603107358, "grad_norm": 0.5458871722221375, "learning_rate": 9.885560613314664e-06, "loss": 0.7313, "step": 477 }, { "epoch": 0.09642959586694279, "grad_norm": 0.4658026397228241, "learning_rate": 9.884864442991364e-06, "loss": 0.7166, "step": 478 }, { "epoch": 0.09663133142314978, "grad_norm": 0.5284778475761414, "learning_rate": 9.884166186237185e-06, "loss": 0.821, "step": 479 }, { "epoch": 0.09683306697935679, "grad_norm": 1.8951740264892578, "learning_rate": 9.883465843350364e-06, "loss": 0.6991, "step": 480 }, { "epoch": 0.09703480253556378, "grad_norm": 0.44958412647247314, "learning_rate": 9.882763414630033e-06, "loss": 0.7198, "step": 481 }, { "epoch": 0.09723653809177077, "grad_norm": 0.8649196624755859, "learning_rate": 9.882058900376218e-06, "loss": 0.7282, "step": 482 }, { "epoch": 0.09743827364797776, "grad_norm": 1.0441064834594727, "learning_rate": 9.881352300889825e-06, "loss": 0.8577, "step": 483 }, { "epoch": 0.09764000920418475, "grad_norm": 0.6799788475036621, "learning_rate": 9.880643616472667e-06, "loss": 0.715, "step": 484 }, { "epoch": 0.09784174476039174, "grad_norm": 0.4560524523258209, "learning_rate": 9.879932847427432e-06, "loss": 0.7117, "step": 485 }, { "epoch": 0.09804348031659874, "grad_norm": 0.7565090656280518, "learning_rate": 9.879219994057706e-06, "loss": 0.7229, "step": 486 }, { "epoch": 0.09824521587280573, "grad_norm": 0.5884823203086853, "learning_rate": 9.878505056667967e-06, "loss": 0.7408, "step": 487 }, { "epoch": 0.09844695142901272, "grad_norm": 0.7502923011779785, "learning_rate": 9.877788035563577e-06, "loss": 0.7072, "step": 488 }, { "epoch": 0.09864868698521972, "grad_norm": 0.611727774143219, "learning_rate": 9.877068931050792e-06, "loss": 0.9156, "step": 489 }, { "epoch": 0.09885042254142672, "grad_norm": 0.5064416527748108, "learning_rate": 9.876347743436758e-06, "loss": 0.7025, "step": 490 }, { "epoch": 0.09905215809763371, "grad_norm": 0.6777665615081787, "learning_rate": 9.875624473029508e-06, "loss": 0.6648, "step": 491 }, { "epoch": 0.0992538936538407, "grad_norm": 2.0562150478363037, "learning_rate": 9.874899120137968e-06, "loss": 0.7049, "step": 492 }, { "epoch": 0.09945562921004769, "grad_norm": 3.049912214279175, "learning_rate": 9.874171685071949e-06, "loss": 0.8232, "step": 493 }, { "epoch": 0.09965736476625468, "grad_norm": 1.2710695266723633, "learning_rate": 9.873442168142158e-06, "loss": 0.8825, "step": 494 }, { "epoch": 0.09985910032246167, "grad_norm": 0.7639725208282471, "learning_rate": 9.872710569660186e-06, "loss": 0.7314, "step": 495 }, { "epoch": 0.10006083587866867, "grad_norm": 0.9314205646514893, "learning_rate": 9.871976889938514e-06, "loss": 0.7111, "step": 496 }, { "epoch": 0.10026257143487566, "grad_norm": 0.8991636037826538, "learning_rate": 9.871241129290511e-06, "loss": 0.7147, "step": 497 }, { "epoch": 0.10046430699108266, "grad_norm": 1.5229334831237793, "learning_rate": 9.870503288030441e-06, "loss": 0.7507, "step": 498 }, { "epoch": 0.10066604254728966, "grad_norm": 1.4551448822021484, "learning_rate": 9.869763366473447e-06, "loss": 0.7372, "step": 499 }, { "epoch": 0.10086777810349665, "grad_norm": 2.448296070098877, "learning_rate": 9.869021364935567e-06, "loss": 0.6818, "step": 500 }, { "epoch": 0.10106951365970364, "grad_norm": 0.8766042590141296, "learning_rate": 9.868277283733725e-06, "loss": 0.7068, "step": 501 }, { "epoch": 0.10127124921591063, "grad_norm": 0.6733140349388123, "learning_rate": 9.867531123185738e-06, "loss": 0.7211, "step": 502 }, { "epoch": 0.10147298477211762, "grad_norm": 0.904449999332428, "learning_rate": 9.866782883610302e-06, "loss": 0.8696, "step": 503 }, { "epoch": 0.10167472032832461, "grad_norm": 0.6029015779495239, "learning_rate": 9.86603256532701e-06, "loss": 0.7392, "step": 504 }, { "epoch": 0.1018764558845316, "grad_norm": 0.4247641861438751, "learning_rate": 9.865280168656337e-06, "loss": 0.683, "step": 505 }, { "epoch": 0.10207819144073861, "grad_norm": 0.6710717678070068, "learning_rate": 9.864525693919648e-06, "loss": 0.7417, "step": 506 }, { "epoch": 0.1022799269969456, "grad_norm": 0.6687069535255432, "learning_rate": 9.863769141439199e-06, "loss": 0.8465, "step": 507 }, { "epoch": 0.1024816625531526, "grad_norm": 0.9954400062561035, "learning_rate": 9.863010511538124e-06, "loss": 0.6569, "step": 508 }, { "epoch": 0.10268339810935959, "grad_norm": 0.5977325439453125, "learning_rate": 9.862249804540453e-06, "loss": 0.7459, "step": 509 }, { "epoch": 0.10288513366556658, "grad_norm": 0.7987101674079895, "learning_rate": 9.861487020771103e-06, "loss": 0.7721, "step": 510 }, { "epoch": 0.10308686922177357, "grad_norm": 1.4089206457138062, "learning_rate": 9.860722160555872e-06, "loss": 0.7118, "step": 511 }, { "epoch": 0.10328860477798056, "grad_norm": 0.8175768852233887, "learning_rate": 9.859955224221446e-06, "loss": 0.8256, "step": 512 }, { "epoch": 0.10349034033418755, "grad_norm": 0.778608500957489, "learning_rate": 9.859186212095405e-06, "loss": 0.6965, "step": 513 }, { "epoch": 0.10369207589039454, "grad_norm": 0.4597846269607544, "learning_rate": 9.858415124506211e-06, "loss": 0.7308, "step": 514 }, { "epoch": 0.10389381144660155, "grad_norm": 0.8675187826156616, "learning_rate": 9.857641961783207e-06, "loss": 0.6938, "step": 515 }, { "epoch": 0.10409554700280854, "grad_norm": 1.7470319271087646, "learning_rate": 9.856866724256634e-06, "loss": 0.6957, "step": 516 }, { "epoch": 0.10429728255901553, "grad_norm": 0.8632996082305908, "learning_rate": 9.856089412257605e-06, "loss": 0.8682, "step": 517 }, { "epoch": 0.10449901811522253, "grad_norm": 1.298071026802063, "learning_rate": 9.855310026118132e-06, "loss": 0.9162, "step": 518 }, { "epoch": 0.10470075367142952, "grad_norm": 0.5649451613426208, "learning_rate": 9.854528566171106e-06, "loss": 0.7407, "step": 519 }, { "epoch": 0.10490248922763651, "grad_norm": 3.4036266803741455, "learning_rate": 9.853745032750309e-06, "loss": 0.7086, "step": 520 }, { "epoch": 0.1051042247838435, "grad_norm": 0.46299856901168823, "learning_rate": 9.852959426190399e-06, "loss": 0.7751, "step": 521 }, { "epoch": 0.10530596034005049, "grad_norm": 1.2342002391815186, "learning_rate": 9.852171746826928e-06, "loss": 0.76, "step": 522 }, { "epoch": 0.10550769589625748, "grad_norm": 2.244950771331787, "learning_rate": 9.85138199499633e-06, "loss": 0.6863, "step": 523 }, { "epoch": 0.10570943145246449, "grad_norm": 2.6959030628204346, "learning_rate": 9.850590171035928e-06, "loss": 0.7939, "step": 524 }, { "epoch": 0.10591116700867148, "grad_norm": 2.4526679515838623, "learning_rate": 9.849796275283925e-06, "loss": 1.0101, "step": 525 }, { "epoch": 0.10611290256487847, "grad_norm": 0.7242956161499023, "learning_rate": 9.849000308079412e-06, "loss": 0.7311, "step": 526 }, { "epoch": 0.10631463812108546, "grad_norm": 0.6420136094093323, "learning_rate": 9.84820226976236e-06, "loss": 0.7165, "step": 527 }, { "epoch": 0.10651637367729246, "grad_norm": 0.5252360105514526, "learning_rate": 9.847402160673634e-06, "loss": 0.702, "step": 528 }, { "epoch": 0.10671810923349945, "grad_norm": 2.2276649475097656, "learning_rate": 9.846599981154975e-06, "loss": 0.8515, "step": 529 }, { "epoch": 0.10691984478970644, "grad_norm": 1.01405668258667, "learning_rate": 9.84579573154901e-06, "loss": 0.7093, "step": 530 }, { "epoch": 0.10712158034591343, "grad_norm": 0.48206350207328796, "learning_rate": 9.844989412199254e-06, "loss": 0.7038, "step": 531 }, { "epoch": 0.10732331590212044, "grad_norm": 0.7908836007118225, "learning_rate": 9.844181023450101e-06, "loss": 0.7945, "step": 532 }, { "epoch": 0.10752505145832743, "grad_norm": 0.42727962136268616, "learning_rate": 9.843370565646833e-06, "loss": 0.6819, "step": 533 }, { "epoch": 0.10772678701453442, "grad_norm": 0.5153601765632629, "learning_rate": 9.842558039135612e-06, "loss": 0.7405, "step": 534 }, { "epoch": 0.10792852257074141, "grad_norm": 0.7418642640113831, "learning_rate": 9.841743444263489e-06, "loss": 0.7064, "step": 535 }, { "epoch": 0.1081302581269484, "grad_norm": 0.6575508117675781, "learning_rate": 9.84092678137839e-06, "loss": 0.9104, "step": 536 }, { "epoch": 0.1083319936831554, "grad_norm": 1.157667875289917, "learning_rate": 9.840108050829135e-06, "loss": 0.7568, "step": 537 }, { "epoch": 0.10853372923936239, "grad_norm": 0.6192544102668762, "learning_rate": 9.839287252965418e-06, "loss": 0.8135, "step": 538 }, { "epoch": 0.10873546479556938, "grad_norm": 0.5880550742149353, "learning_rate": 9.838464388137819e-06, "loss": 0.7347, "step": 539 }, { "epoch": 0.10893720035177637, "grad_norm": 0.7463414669036865, "learning_rate": 9.837639456697802e-06, "loss": 0.8709, "step": 540 }, { "epoch": 0.10913893590798338, "grad_norm": 0.644129753112793, "learning_rate": 9.836812458997715e-06, "loss": 0.7348, "step": 541 }, { "epoch": 0.10934067146419037, "grad_norm": 1.0127805471420288, "learning_rate": 9.835983395390784e-06, "loss": 0.8324, "step": 542 }, { "epoch": 0.10954240702039736, "grad_norm": 0.7811554074287415, "learning_rate": 9.835152266231121e-06, "loss": 0.8713, "step": 543 }, { "epoch": 0.10974414257660435, "grad_norm": 1.3340731859207153, "learning_rate": 9.834319071873719e-06, "loss": 0.8607, "step": 544 }, { "epoch": 0.10994587813281134, "grad_norm": 0.8164252042770386, "learning_rate": 9.833483812674453e-06, "loss": 0.7775, "step": 545 }, { "epoch": 0.11014761368901833, "grad_norm": 0.535208523273468, "learning_rate": 9.832646488990081e-06, "loss": 0.7571, "step": 546 }, { "epoch": 0.11034934924522533, "grad_norm": 0.4087328016757965, "learning_rate": 9.831807101178242e-06, "loss": 0.7241, "step": 547 }, { "epoch": 0.11055108480143232, "grad_norm": 0.5368490219116211, "learning_rate": 9.830965649597455e-06, "loss": 0.8102, "step": 548 }, { "epoch": 0.11075282035763931, "grad_norm": 1.0589547157287598, "learning_rate": 9.830122134607125e-06, "loss": 0.7413, "step": 549 }, { "epoch": 0.11095455591384631, "grad_norm": 0.6054285168647766, "learning_rate": 9.82927655656753e-06, "loss": 0.7119, "step": 550 }, { "epoch": 0.1111562914700533, "grad_norm": 0.45449957251548767, "learning_rate": 9.828428915839843e-06, "loss": 0.8459, "step": 551 }, { "epoch": 0.1113580270262603, "grad_norm": 0.6329957246780396, "learning_rate": 9.827579212786103e-06, "loss": 1.019, "step": 552 }, { "epoch": 0.11155976258246729, "grad_norm": 0.724990963935852, "learning_rate": 9.826727447769237e-06, "loss": 0.6923, "step": 553 }, { "epoch": 0.11176149813867428, "grad_norm": 0.9215730428695679, "learning_rate": 9.825873621153055e-06, "loss": 0.7808, "step": 554 }, { "epoch": 0.11196323369488127, "grad_norm": 0.5018446445465088, "learning_rate": 9.825017733302241e-06, "loss": 0.7438, "step": 555 }, { "epoch": 0.11216496925108826, "grad_norm": 0.8885820508003235, "learning_rate": 9.82415978458237e-06, "loss": 0.7674, "step": 556 }, { "epoch": 0.11236670480729526, "grad_norm": 0.8014993071556091, "learning_rate": 9.823299775359882e-06, "loss": 0.7115, "step": 557 }, { "epoch": 0.11256844036350226, "grad_norm": 0.4157255291938782, "learning_rate": 9.82243770600211e-06, "loss": 0.7158, "step": 558 }, { "epoch": 0.11277017591970925, "grad_norm": 0.8874197006225586, "learning_rate": 9.821573576877264e-06, "loss": 0.7286, "step": 559 }, { "epoch": 0.11297191147591625, "grad_norm": 0.4824160933494568, "learning_rate": 9.820707388354428e-06, "loss": 0.6904, "step": 560 }, { "epoch": 0.11317364703212324, "grad_norm": 0.44073861837387085, "learning_rate": 9.819839140803571e-06, "loss": 0.7417, "step": 561 }, { "epoch": 0.11337538258833023, "grad_norm": 0.7926369905471802, "learning_rate": 9.818968834595544e-06, "loss": 0.7159, "step": 562 }, { "epoch": 0.11357711814453722, "grad_norm": 0.503059446811676, "learning_rate": 9.818096470102067e-06, "loss": 0.7917, "step": 563 }, { "epoch": 0.11377885370074421, "grad_norm": 0.7098116874694824, "learning_rate": 9.817222047695751e-06, "loss": 0.7545, "step": 564 }, { "epoch": 0.1139805892569512, "grad_norm": 1.5219429731369019, "learning_rate": 9.816345567750078e-06, "loss": 0.8778, "step": 565 }, { "epoch": 0.1141823248131582, "grad_norm": 0.6335718631744385, "learning_rate": 9.815467030639414e-06, "loss": 0.8104, "step": 566 }, { "epoch": 0.1143840603693652, "grad_norm": 1.10847806930542, "learning_rate": 9.814586436738998e-06, "loss": 0.7334, "step": 567 }, { "epoch": 0.11458579592557219, "grad_norm": 0.5866941809654236, "learning_rate": 9.81370378642495e-06, "loss": 0.7219, "step": 568 }, { "epoch": 0.11478753148177918, "grad_norm": 0.7550913691520691, "learning_rate": 9.812819080074274e-06, "loss": 0.8763, "step": 569 }, { "epoch": 0.11498926703798618, "grad_norm": 0.46390995383262634, "learning_rate": 9.811932318064843e-06, "loss": 0.8953, "step": 570 }, { "epoch": 0.11519100259419317, "grad_norm": 0.983810544013977, "learning_rate": 9.811043500775415e-06, "loss": 0.8389, "step": 571 }, { "epoch": 0.11539273815040016, "grad_norm": 0.5378410220146179, "learning_rate": 9.81015262858562e-06, "loss": 0.8677, "step": 572 }, { "epoch": 0.11559447370660715, "grad_norm": 0.7762982845306396, "learning_rate": 9.80925970187597e-06, "loss": 0.8762, "step": 573 }, { "epoch": 0.11579620926281414, "grad_norm": 0.5548637509346008, "learning_rate": 9.808364721027854e-06, "loss": 0.6611, "step": 574 }, { "epoch": 0.11599794481902115, "grad_norm": 0.6933954954147339, "learning_rate": 9.807467686423536e-06, "loss": 0.7337, "step": 575 }, { "epoch": 0.11619968037522814, "grad_norm": 1.0687283277511597, "learning_rate": 9.80656859844616e-06, "loss": 0.8725, "step": 576 }, { "epoch": 0.11640141593143513, "grad_norm": 1.4584949016571045, "learning_rate": 9.805667457479747e-06, "loss": 0.7366, "step": 577 }, { "epoch": 0.11660315148764212, "grad_norm": 1.2576607465744019, "learning_rate": 9.80476426390919e-06, "loss": 0.7085, "step": 578 }, { "epoch": 0.11680488704384911, "grad_norm": 0.730647087097168, "learning_rate": 9.803859018120265e-06, "loss": 0.7253, "step": 579 }, { "epoch": 0.1170066226000561, "grad_norm": 2.792001724243164, "learning_rate": 9.802951720499623e-06, "loss": 0.7478, "step": 580 }, { "epoch": 0.1172083581562631, "grad_norm": 1.430155634880066, "learning_rate": 9.80204237143479e-06, "loss": 0.7854, "step": 581 }, { "epoch": 0.11741009371247009, "grad_norm": 2.819394826889038, "learning_rate": 9.801130971314165e-06, "loss": 0.8178, "step": 582 }, { "epoch": 0.11761182926867708, "grad_norm": 0.820543646812439, "learning_rate": 9.800217520527031e-06, "loss": 0.8311, "step": 583 }, { "epoch": 0.11781356482488409, "grad_norm": 1.3293335437774658, "learning_rate": 9.799302019463541e-06, "loss": 0.6847, "step": 584 }, { "epoch": 0.11801530038109108, "grad_norm": 0.6411197185516357, "learning_rate": 9.798384468514725e-06, "loss": 0.872, "step": 585 }, { "epoch": 0.11821703593729807, "grad_norm": 0.6462484002113342, "learning_rate": 9.797464868072489e-06, "loss": 0.8846, "step": 586 }, { "epoch": 0.11841877149350506, "grad_norm": 0.8242557048797607, "learning_rate": 9.796543218529612e-06, "loss": 0.8605, "step": 587 }, { "epoch": 0.11862050704971205, "grad_norm": 0.7354720830917358, "learning_rate": 9.795619520279754e-06, "loss": 0.6761, "step": 588 }, { "epoch": 0.11882224260591905, "grad_norm": 1.7302488088607788, "learning_rate": 9.794693773717445e-06, "loss": 0.9048, "step": 589 }, { "epoch": 0.11902397816212604, "grad_norm": 0.757748544216156, "learning_rate": 9.79376597923809e-06, "loss": 0.7343, "step": 590 }, { "epoch": 0.11922571371833303, "grad_norm": 0.8567695021629333, "learning_rate": 9.792836137237973e-06, "loss": 0.7886, "step": 591 }, { "epoch": 0.11942744927454002, "grad_norm": 0.5145772695541382, "learning_rate": 9.791904248114247e-06, "loss": 0.6968, "step": 592 }, { "epoch": 0.11962918483074703, "grad_norm": 0.6528315544128418, "learning_rate": 9.790970312264943e-06, "loss": 0.6785, "step": 593 }, { "epoch": 0.11983092038695402, "grad_norm": 0.6215816140174866, "learning_rate": 9.790034330088964e-06, "loss": 0.7053, "step": 594 }, { "epoch": 0.12003265594316101, "grad_norm": 0.8241238594055176, "learning_rate": 9.78909630198609e-06, "loss": 0.6551, "step": 595 }, { "epoch": 0.120234391499368, "grad_norm": 0.5994774103164673, "learning_rate": 9.788156228356969e-06, "loss": 0.7186, "step": 596 }, { "epoch": 0.12043612705557499, "grad_norm": 1.1751370429992676, "learning_rate": 9.787214109603134e-06, "loss": 0.7103, "step": 597 }, { "epoch": 0.12063786261178198, "grad_norm": 1.848848581314087, "learning_rate": 9.786269946126976e-06, "loss": 0.8232, "step": 598 }, { "epoch": 0.12083959816798898, "grad_norm": 2.087062120437622, "learning_rate": 9.785323738331773e-06, "loss": 0.7161, "step": 599 }, { "epoch": 0.12104133372419597, "grad_norm": 2.3562679290771484, "learning_rate": 9.78437548662167e-06, "loss": 0.7386, "step": 600 }, { "epoch": 0.12124306928040297, "grad_norm": 2.2806203365325928, "learning_rate": 9.783425191401686e-06, "loss": 0.8851, "step": 601 }, { "epoch": 0.12144480483660997, "grad_norm": 1.4156206846237183, "learning_rate": 9.78247285307771e-06, "loss": 0.7962, "step": 602 }, { "epoch": 0.12164654039281696, "grad_norm": 0.6872971653938293, "learning_rate": 9.781518472056507e-06, "loss": 0.7202, "step": 603 }, { "epoch": 0.12184827594902395, "grad_norm": 0.5104350447654724, "learning_rate": 9.780562048745715e-06, "loss": 0.9973, "step": 604 }, { "epoch": 0.12205001150523094, "grad_norm": 0.7299901247024536, "learning_rate": 9.779603583553842e-06, "loss": 0.7373, "step": 605 }, { "epoch": 0.12225174706143793, "grad_norm": 0.38602492213249207, "learning_rate": 9.77864307689027e-06, "loss": 0.7382, "step": 606 }, { "epoch": 0.12245348261764492, "grad_norm": 0.5797430276870728, "learning_rate": 9.777680529165251e-06, "loss": 0.7304, "step": 607 }, { "epoch": 0.12265521817385192, "grad_norm": 0.6225494742393494, "learning_rate": 9.776715940789911e-06, "loss": 0.7016, "step": 608 }, { "epoch": 0.12285695373005891, "grad_norm": 0.610600471496582, "learning_rate": 9.775749312176249e-06, "loss": 0.6823, "step": 609 }, { "epoch": 0.12305868928626591, "grad_norm": 0.5466820597648621, "learning_rate": 9.774780643737126e-06, "loss": 0.7635, "step": 610 }, { "epoch": 0.1232604248424729, "grad_norm": 0.6284288167953491, "learning_rate": 9.773809935886287e-06, "loss": 0.706, "step": 611 }, { "epoch": 0.1234621603986799, "grad_norm": 0.5985273122787476, "learning_rate": 9.77283718903834e-06, "loss": 0.7338, "step": 612 }, { "epoch": 0.12366389595488689, "grad_norm": 0.6078040599822998, "learning_rate": 9.771862403608765e-06, "loss": 0.7114, "step": 613 }, { "epoch": 0.12386563151109388, "grad_norm": 0.5801209807395935, "learning_rate": 9.770885580013917e-06, "loss": 0.7514, "step": 614 }, { "epoch": 0.12406736706730087, "grad_norm": 0.5198795199394226, "learning_rate": 9.769906718671017e-06, "loss": 0.6857, "step": 615 }, { "epoch": 0.12426910262350786, "grad_norm": 0.8949345946311951, "learning_rate": 9.768925819998157e-06, "loss": 0.7517, "step": 616 }, { "epoch": 0.12447083817971485, "grad_norm": 0.8734506964683533, "learning_rate": 9.7679428844143e-06, "loss": 0.8156, "step": 617 }, { "epoch": 0.12467257373592185, "grad_norm": 0.8583992123603821, "learning_rate": 9.766957912339281e-06, "loss": 0.751, "step": 618 }, { "epoch": 0.12487430929212885, "grad_norm": 0.5585662126541138, "learning_rate": 9.7659709041938e-06, "loss": 0.6754, "step": 619 }, { "epoch": 0.12507604484833584, "grad_norm": 0.5230299830436707, "learning_rate": 9.764981860399432e-06, "loss": 0.6779, "step": 620 }, { "epoch": 0.12527778040454282, "grad_norm": 0.48600155115127563, "learning_rate": 9.763990781378616e-06, "loss": 0.8704, "step": 621 }, { "epoch": 0.12547951596074983, "grad_norm": 4.001948833465576, "learning_rate": 9.762997667554666e-06, "loss": 0.8584, "step": 622 }, { "epoch": 0.12568125151695683, "grad_norm": 0.4935590922832489, "learning_rate": 9.762002519351761e-06, "loss": 0.9628, "step": 623 }, { "epoch": 0.1258829870731638, "grad_norm": 5.8011274337768555, "learning_rate": 9.76100533719495e-06, "loss": 0.7344, "step": 624 }, { "epoch": 0.12608472262937082, "grad_norm": 1.531157374382019, "learning_rate": 9.760006121510152e-06, "loss": 1.3136, "step": 625 }, { "epoch": 0.1262864581855778, "grad_norm": 0.5714470744132996, "learning_rate": 9.759004872724153e-06, "loss": 0.7915, "step": 626 }, { "epoch": 0.1264881937417848, "grad_norm": 1.6371656656265259, "learning_rate": 9.758001591264608e-06, "loss": 0.7265, "step": 627 }, { "epoch": 0.12668992929799178, "grad_norm": 1.5811271667480469, "learning_rate": 9.75699627756004e-06, "loss": 0.6997, "step": 628 }, { "epoch": 0.12689166485419878, "grad_norm": 2.01772403717041, "learning_rate": 9.755988932039842e-06, "loss": 0.7678, "step": 629 }, { "epoch": 0.12709340041040576, "grad_norm": 0.5429158210754395, "learning_rate": 9.754979555134267e-06, "loss": 0.7126, "step": 630 }, { "epoch": 0.12729513596661277, "grad_norm": 0.8591412901878357, "learning_rate": 9.753968147274448e-06, "loss": 0.7246, "step": 631 }, { "epoch": 0.12749687152281977, "grad_norm": 2.1179087162017822, "learning_rate": 9.752954708892379e-06, "loss": 0.7497, "step": 632 }, { "epoch": 0.12769860707902675, "grad_norm": 0.6513729095458984, "learning_rate": 9.751939240420916e-06, "loss": 0.7581, "step": 633 }, { "epoch": 0.12790034263523375, "grad_norm": 0.41945937275886536, "learning_rate": 9.750921742293794e-06, "loss": 0.9612, "step": 634 }, { "epoch": 0.12810207819144073, "grad_norm": 0.6694250106811523, "learning_rate": 9.749902214945602e-06, "loss": 0.7349, "step": 635 }, { "epoch": 0.12830381374764774, "grad_norm": 0.6524680256843567, "learning_rate": 9.748880658811806e-06, "loss": 0.6788, "step": 636 }, { "epoch": 0.12850554930385472, "grad_norm": 0.48333224654197693, "learning_rate": 9.747857074328735e-06, "loss": 0.6621, "step": 637 }, { "epoch": 0.12870728486006172, "grad_norm": 0.4791509211063385, "learning_rate": 9.746831461933581e-06, "loss": 0.7244, "step": 638 }, { "epoch": 0.1289090204162687, "grad_norm": 1.133988618850708, "learning_rate": 9.745803822064409e-06, "loss": 0.7146, "step": 639 }, { "epoch": 0.1291107559724757, "grad_norm": 0.5031291246414185, "learning_rate": 9.744774155160143e-06, "loss": 0.9058, "step": 640 }, { "epoch": 0.1293124915286827, "grad_norm": 0.5303350687026978, "learning_rate": 9.743742461660577e-06, "loss": 0.7332, "step": 641 }, { "epoch": 0.1295142270848897, "grad_norm": 1.1249873638153076, "learning_rate": 9.74270874200637e-06, "loss": 0.9229, "step": 642 }, { "epoch": 0.1297159626410967, "grad_norm": 2.115898609161377, "learning_rate": 9.741672996639046e-06, "loss": 0.6615, "step": 643 }, { "epoch": 0.12991769819730367, "grad_norm": 0.8712553381919861, "learning_rate": 9.740635226000994e-06, "loss": 0.6996, "step": 644 }, { "epoch": 0.13011943375351068, "grad_norm": 1.0483100414276123, "learning_rate": 9.739595430535467e-06, "loss": 0.7213, "step": 645 }, { "epoch": 0.13032116930971765, "grad_norm": 0.5651170015335083, "learning_rate": 9.738553610686586e-06, "loss": 0.6623, "step": 646 }, { "epoch": 0.13052290486592466, "grad_norm": 0.599432110786438, "learning_rate": 9.737509766899333e-06, "loss": 0.8433, "step": 647 }, { "epoch": 0.13072464042213164, "grad_norm": 2.7517247200012207, "learning_rate": 9.736463899619557e-06, "loss": 0.6449, "step": 648 }, { "epoch": 0.13092637597833864, "grad_norm": 1.9037710428237915, "learning_rate": 9.73541600929397e-06, "loss": 0.7139, "step": 649 }, { "epoch": 0.13112811153454565, "grad_norm": 5.7445220947265625, "learning_rate": 9.734366096370148e-06, "loss": 0.6772, "step": 650 }, { "epoch": 0.13132984709075263, "grad_norm": 1.8100695610046387, "learning_rate": 9.733314161296534e-06, "loss": 0.704, "step": 651 }, { "epoch": 0.13153158264695963, "grad_norm": 0.4549188017845154, "learning_rate": 9.73226020452243e-06, "loss": 0.8671, "step": 652 }, { "epoch": 0.1317333182031666, "grad_norm": 0.832423210144043, "learning_rate": 9.731204226498006e-06, "loss": 1.0424, "step": 653 }, { "epoch": 0.13193505375937362, "grad_norm": 0.6530823707580566, "learning_rate": 9.730146227674289e-06, "loss": 0.7243, "step": 654 }, { "epoch": 0.1321367893155806, "grad_norm": 0.6458786129951477, "learning_rate": 9.729086208503174e-06, "loss": 0.6997, "step": 655 }, { "epoch": 0.1323385248717876, "grad_norm": 1.432695746421814, "learning_rate": 9.72802416943742e-06, "loss": 0.9292, "step": 656 }, { "epoch": 0.13254026042799458, "grad_norm": 0.6455374360084534, "learning_rate": 9.726960110930648e-06, "loss": 0.7323, "step": 657 }, { "epoch": 0.13274199598420158, "grad_norm": 0.4740164279937744, "learning_rate": 9.725894033437335e-06, "loss": 0.721, "step": 658 }, { "epoch": 0.1329437315404086, "grad_norm": 0.73670494556427, "learning_rate": 9.724825937412832e-06, "loss": 0.676, "step": 659 }, { "epoch": 0.13314546709661557, "grad_norm": 0.5248075723648071, "learning_rate": 9.723755823313342e-06, "loss": 0.694, "step": 660 }, { "epoch": 0.13334720265282257, "grad_norm": 0.4952867329120636, "learning_rate": 9.722683691595933e-06, "loss": 0.7452, "step": 661 }, { "epoch": 0.13354893820902955, "grad_norm": 0.5512588024139404, "learning_rate": 9.72160954271854e-06, "loss": 0.666, "step": 662 }, { "epoch": 0.13375067376523656, "grad_norm": 1.7721054553985596, "learning_rate": 9.720533377139949e-06, "loss": 0.8351, "step": 663 }, { "epoch": 0.13395240932144353, "grad_norm": 0.647897481918335, "learning_rate": 9.719455195319819e-06, "loss": 0.7144, "step": 664 }, { "epoch": 0.13415414487765054, "grad_norm": 0.5909448862075806, "learning_rate": 9.718374997718662e-06, "loss": 0.6794, "step": 665 }, { "epoch": 0.13435588043385754, "grad_norm": 0.6904434561729431, "learning_rate": 9.717292784797854e-06, "loss": 0.7908, "step": 666 }, { "epoch": 0.13455761599006452, "grad_norm": 1.919708013534546, "learning_rate": 9.716208557019632e-06, "loss": 0.7146, "step": 667 }, { "epoch": 0.13475935154627153, "grad_norm": 0.973901093006134, "learning_rate": 9.715122314847093e-06, "loss": 0.7182, "step": 668 }, { "epoch": 0.1349610871024785, "grad_norm": 0.5195472836494446, "learning_rate": 9.714034058744193e-06, "loss": 0.8965, "step": 669 }, { "epoch": 0.1351628226586855, "grad_norm": 1.6600466966629028, "learning_rate": 9.712943789175753e-06, "loss": 0.7199, "step": 670 }, { "epoch": 0.1353645582148925, "grad_norm": 0.6698784828186035, "learning_rate": 9.711851506607446e-06, "loss": 0.7798, "step": 671 }, { "epoch": 0.1355662937710995, "grad_norm": 0.6091451644897461, "learning_rate": 9.710757211505812e-06, "loss": 0.6942, "step": 672 }, { "epoch": 0.13576802932730647, "grad_norm": 0.5566756129264832, "learning_rate": 9.70966090433825e-06, "loss": 0.7163, "step": 673 }, { "epoch": 0.13596976488351348, "grad_norm": 0.8096559047698975, "learning_rate": 9.708562585573013e-06, "loss": 0.7149, "step": 674 }, { "epoch": 0.13617150043972048, "grad_norm": 0.6540467739105225, "learning_rate": 9.707462255679217e-06, "loss": 0.7077, "step": 675 }, { "epoch": 0.13637323599592746, "grad_norm": 0.3687657415866852, "learning_rate": 9.706359915126838e-06, "loss": 0.6766, "step": 676 }, { "epoch": 0.13657497155213447, "grad_norm": 0.8468163013458252, "learning_rate": 9.70525556438671e-06, "loss": 0.7108, "step": 677 }, { "epoch": 0.13677670710834144, "grad_norm": 0.6615922451019287, "learning_rate": 9.704149203930522e-06, "loss": 0.7482, "step": 678 }, { "epoch": 0.13697844266454845, "grad_norm": 0.4638476073741913, "learning_rate": 9.703040834230828e-06, "loss": 0.686, "step": 679 }, { "epoch": 0.13718017822075543, "grad_norm": 0.7426740527153015, "learning_rate": 9.701930455761036e-06, "loss": 0.9151, "step": 680 }, { "epoch": 0.13738191377696243, "grad_norm": 0.6113342046737671, "learning_rate": 9.700818068995407e-06, "loss": 1.0156, "step": 681 }, { "epoch": 0.1375836493331694, "grad_norm": 0.7739665508270264, "learning_rate": 9.699703674409074e-06, "loss": 0.6889, "step": 682 }, { "epoch": 0.13778538488937642, "grad_norm": 0.6738470196723938, "learning_rate": 9.698587272478012e-06, "loss": 0.7311, "step": 683 }, { "epoch": 0.13798712044558342, "grad_norm": 0.7146970629692078, "learning_rate": 9.697468863679065e-06, "loss": 0.685, "step": 684 }, { "epoch": 0.1381888560017904, "grad_norm": 0.555719792842865, "learning_rate": 9.696348448489927e-06, "loss": 1.0687, "step": 685 }, { "epoch": 0.1383905915579974, "grad_norm": 0.5952572226524353, "learning_rate": 9.695226027389154e-06, "loss": 0.7047, "step": 686 }, { "epoch": 0.13859232711420438, "grad_norm": 1.26414954662323, "learning_rate": 9.69410160085615e-06, "loss": 0.7015, "step": 687 }, { "epoch": 0.1387940626704114, "grad_norm": 0.5881784558296204, "learning_rate": 9.692975169371189e-06, "loss": 0.7379, "step": 688 }, { "epoch": 0.13899579822661837, "grad_norm": 0.9572243690490723, "learning_rate": 9.69184673341539e-06, "loss": 0.6954, "step": 689 }, { "epoch": 0.13919753378282537, "grad_norm": 1.2255734205245972, "learning_rate": 9.690716293470735e-06, "loss": 0.6955, "step": 690 }, { "epoch": 0.13939926933903235, "grad_norm": 0.5322736501693726, "learning_rate": 9.689583850020058e-06, "loss": 0.8187, "step": 691 }, { "epoch": 0.13960100489523936, "grad_norm": 0.6318933367729187, "learning_rate": 9.68844940354705e-06, "loss": 0.6888, "step": 692 }, { "epoch": 0.13980274045144636, "grad_norm": 1.0587413311004639, "learning_rate": 9.687312954536255e-06, "loss": 0.7381, "step": 693 }, { "epoch": 0.14000447600765334, "grad_norm": 1.1156810522079468, "learning_rate": 9.68617450347308e-06, "loss": 0.6845, "step": 694 }, { "epoch": 0.14020621156386034, "grad_norm": 1.1569520235061646, "learning_rate": 9.685034050843779e-06, "loss": 0.7295, "step": 695 }, { "epoch": 0.14040794712006732, "grad_norm": 1.456122875213623, "learning_rate": 9.683891597135462e-06, "loss": 0.6988, "step": 696 }, { "epoch": 0.14060968267627433, "grad_norm": 0.5524277687072754, "learning_rate": 9.6827471428361e-06, "loss": 0.7191, "step": 697 }, { "epoch": 0.1408114182324813, "grad_norm": 0.667767345905304, "learning_rate": 9.681600688434509e-06, "loss": 0.6836, "step": 698 }, { "epoch": 0.1410131537886883, "grad_norm": 0.445433109998703, "learning_rate": 9.68045223442037e-06, "loss": 0.815, "step": 699 }, { "epoch": 0.1412148893448953, "grad_norm": 0.8379462957382202, "learning_rate": 9.679301781284209e-06, "loss": 0.9189, "step": 700 }, { "epoch": 0.1414166249011023, "grad_norm": 0.7621182799339294, "learning_rate": 9.67814932951741e-06, "loss": 0.7113, "step": 701 }, { "epoch": 0.1416183604573093, "grad_norm": 0.6904314160346985, "learning_rate": 9.676994879612209e-06, "loss": 0.722, "step": 702 }, { "epoch": 0.14182009601351628, "grad_norm": 1.0283129215240479, "learning_rate": 9.675838432061698e-06, "loss": 0.6987, "step": 703 }, { "epoch": 0.14202183156972328, "grad_norm": 1.1526720523834229, "learning_rate": 9.674679987359822e-06, "loss": 0.8583, "step": 704 }, { "epoch": 0.14222356712593026, "grad_norm": 0.6819185018539429, "learning_rate": 9.673519546001373e-06, "loss": 0.9021, "step": 705 }, { "epoch": 0.14242530268213727, "grad_norm": 1.3759137392044067, "learning_rate": 9.672357108482005e-06, "loss": 0.7268, "step": 706 }, { "epoch": 0.14262703823834424, "grad_norm": 0.46242034435272217, "learning_rate": 9.671192675298218e-06, "loss": 0.6623, "step": 707 }, { "epoch": 0.14282877379455125, "grad_norm": 22.558115005493164, "learning_rate": 9.670026246947367e-06, "loss": 0.6944, "step": 708 }, { "epoch": 0.14303050935075823, "grad_norm": 1.0445126295089722, "learning_rate": 9.668857823927658e-06, "loss": 0.7426, "step": 709 }, { "epoch": 0.14323224490696523, "grad_norm": 0.49592289328575134, "learning_rate": 9.66768740673815e-06, "loss": 0.7744, "step": 710 }, { "epoch": 0.14343398046317224, "grad_norm": 0.9152998924255371, "learning_rate": 9.666514995878755e-06, "loss": 0.8277, "step": 711 }, { "epoch": 0.14363571601937922, "grad_norm": 0.7208297848701477, "learning_rate": 9.665340591850235e-06, "loss": 0.8091, "step": 712 }, { "epoch": 0.14383745157558622, "grad_norm": 0.45589447021484375, "learning_rate": 9.664164195154199e-06, "loss": 0.709, "step": 713 }, { "epoch": 0.1440391871317932, "grad_norm": 0.45049798488616943, "learning_rate": 9.662985806293115e-06, "loss": 0.7353, "step": 714 }, { "epoch": 0.1442409226880002, "grad_norm": 0.7140663862228394, "learning_rate": 9.661805425770298e-06, "loss": 0.7304, "step": 715 }, { "epoch": 0.14444265824420718, "grad_norm": 1.5145859718322754, "learning_rate": 9.660623054089913e-06, "loss": 0.7908, "step": 716 }, { "epoch": 0.1446443938004142, "grad_norm": 0.4583665430545807, "learning_rate": 9.659438691756976e-06, "loss": 0.7136, "step": 717 }, { "epoch": 0.1448461293566212, "grad_norm": 0.868816077709198, "learning_rate": 9.658252339277359e-06, "loss": 0.7388, "step": 718 }, { "epoch": 0.14504786491282817, "grad_norm": 0.7502593994140625, "learning_rate": 9.65706399715777e-06, "loss": 0.7955, "step": 719 }, { "epoch": 0.14524960046903518, "grad_norm": 0.4568173885345459, "learning_rate": 9.655873665905781e-06, "loss": 0.66, "step": 720 }, { "epoch": 0.14545133602524216, "grad_norm": 1.3116257190704346, "learning_rate": 9.654681346029809e-06, "loss": 0.7388, "step": 721 }, { "epoch": 0.14565307158144916, "grad_norm": 0.8519835472106934, "learning_rate": 9.653487038039116e-06, "loss": 1.0086, "step": 722 }, { "epoch": 0.14585480713765614, "grad_norm": 0.7730309367179871, "learning_rate": 9.652290742443818e-06, "loss": 0.7059, "step": 723 }, { "epoch": 0.14605654269386315, "grad_norm": 1.275344967842102, "learning_rate": 9.651092459754879e-06, "loss": 0.7021, "step": 724 }, { "epoch": 0.14625827825007012, "grad_norm": 0.962645411491394, "learning_rate": 9.64989219048411e-06, "loss": 0.799, "step": 725 }, { "epoch": 0.14646001380627713, "grad_norm": 0.774117648601532, "learning_rate": 9.648689935144175e-06, "loss": 0.7324, "step": 726 }, { "epoch": 0.14666174936248413, "grad_norm": 0.6590328812599182, "learning_rate": 9.647485694248579e-06, "loss": 0.7218, "step": 727 }, { "epoch": 0.1468634849186911, "grad_norm": 0.5773506164550781, "learning_rate": 9.646279468311684e-06, "loss": 0.7326, "step": 728 }, { "epoch": 0.14706522047489812, "grad_norm": 0.5804173350334167, "learning_rate": 9.645071257848692e-06, "loss": 0.7098, "step": 729 }, { "epoch": 0.1472669560311051, "grad_norm": 1.1419330835342407, "learning_rate": 9.643861063375657e-06, "loss": 0.8657, "step": 730 }, { "epoch": 0.1474686915873121, "grad_norm": 2.3604445457458496, "learning_rate": 9.642648885409475e-06, "loss": 0.6524, "step": 731 }, { "epoch": 0.14767042714351908, "grad_norm": 1.2862043380737305, "learning_rate": 9.6414347244679e-06, "loss": 0.9618, "step": 732 }, { "epoch": 0.14787216269972608, "grad_norm": 0.5891053080558777, "learning_rate": 9.640218581069522e-06, "loss": 0.6627, "step": 733 }, { "epoch": 0.14807389825593306, "grad_norm": 0.5970614552497864, "learning_rate": 9.639000455733784e-06, "loss": 0.6595, "step": 734 }, { "epoch": 0.14827563381214007, "grad_norm": 1.3071032762527466, "learning_rate": 9.637780348980972e-06, "loss": 0.689, "step": 735 }, { "epoch": 0.14847736936834707, "grad_norm": 0.3851945400238037, "learning_rate": 9.636558261332221e-06, "loss": 0.8366, "step": 736 }, { "epoch": 0.14867910492455405, "grad_norm": 0.7860169410705566, "learning_rate": 9.63533419330951e-06, "loss": 0.7495, "step": 737 }, { "epoch": 0.14888084048076106, "grad_norm": 0.6074318885803223, "learning_rate": 9.634108145435665e-06, "loss": 0.8666, "step": 738 }, { "epoch": 0.14908257603696803, "grad_norm": 0.9100054502487183, "learning_rate": 9.63288011823436e-06, "loss": 0.7006, "step": 739 }, { "epoch": 0.14928431159317504, "grad_norm": 0.504601776599884, "learning_rate": 9.631650112230108e-06, "loss": 0.6408, "step": 740 }, { "epoch": 0.14948604714938202, "grad_norm": 0.7778279185295105, "learning_rate": 9.630418127948273e-06, "loss": 0.7248, "step": 741 }, { "epoch": 0.14968778270558902, "grad_norm": 0.5700371265411377, "learning_rate": 9.629184165915063e-06, "loss": 0.6877, "step": 742 }, { "epoch": 0.149889518261796, "grad_norm": 0.4914577305316925, "learning_rate": 9.627948226657527e-06, "loss": 0.8579, "step": 743 }, { "epoch": 0.150091253818003, "grad_norm": 0.7705051302909851, "learning_rate": 9.626710310703565e-06, "loss": 0.6808, "step": 744 }, { "epoch": 0.15029298937421, "grad_norm": 0.5976186394691467, "learning_rate": 9.625470418581913e-06, "loss": 0.7094, "step": 745 }, { "epoch": 0.150494724930417, "grad_norm": 0.5062031745910645, "learning_rate": 9.62422855082216e-06, "loss": 0.8747, "step": 746 }, { "epoch": 0.150696460486624, "grad_norm": 1.6156225204467773, "learning_rate": 9.622984707954732e-06, "loss": 0.7054, "step": 747 }, { "epoch": 0.15089819604283097, "grad_norm": 0.4361424446105957, "learning_rate": 9.621738890510901e-06, "loss": 0.7104, "step": 748 }, { "epoch": 0.15109993159903798, "grad_norm": 0.5863596796989441, "learning_rate": 9.620491099022786e-06, "loss": 0.6611, "step": 749 }, { "epoch": 0.15130166715524496, "grad_norm": 0.8612884879112244, "learning_rate": 9.61924133402334e-06, "loss": 0.7151, "step": 750 }, { "epoch": 0.15150340271145196, "grad_norm": 1.4638561010360718, "learning_rate": 9.617989596046368e-06, "loss": 0.7716, "step": 751 }, { "epoch": 0.15170513826765894, "grad_norm": 0.8269960284233093, "learning_rate": 9.616735885626516e-06, "loss": 0.7143, "step": 752 }, { "epoch": 0.15190687382386595, "grad_norm": 0.9362808465957642, "learning_rate": 9.615480203299266e-06, "loss": 0.7181, "step": 753 }, { "epoch": 0.15210860938007295, "grad_norm": 0.7409095168113708, "learning_rate": 9.61422254960095e-06, "loss": 0.6982, "step": 754 }, { "epoch": 0.15231034493627993, "grad_norm": 0.42784583568573, "learning_rate": 9.612962925068738e-06, "loss": 0.8009, "step": 755 }, { "epoch": 0.15251208049248693, "grad_norm": 0.9697193503379822, "learning_rate": 9.611701330240644e-06, "loss": 0.803, "step": 756 }, { "epoch": 0.1527138160486939, "grad_norm": 0.6597254276275635, "learning_rate": 9.610437765655522e-06, "loss": 0.7358, "step": 757 }, { "epoch": 0.15291555160490092, "grad_norm": 5.7137675285339355, "learning_rate": 9.609172231853066e-06, "loss": 0.7157, "step": 758 }, { "epoch": 0.1531172871611079, "grad_norm": 1.3956544399261475, "learning_rate": 9.607904729373816e-06, "loss": 0.7055, "step": 759 }, { "epoch": 0.1533190227173149, "grad_norm": 0.8854261636734009, "learning_rate": 9.606635258759146e-06, "loss": 0.6771, "step": 760 }, { "epoch": 0.1535207582735219, "grad_norm": 0.5108949542045593, "learning_rate": 9.605363820551277e-06, "loss": 0.6921, "step": 761 }, { "epoch": 0.15372249382972888, "grad_norm": 0.5805178284645081, "learning_rate": 9.604090415293265e-06, "loss": 0.6944, "step": 762 }, { "epoch": 0.1539242293859359, "grad_norm": 5.874364852905273, "learning_rate": 9.60281504352901e-06, "loss": 0.7408, "step": 763 }, { "epoch": 0.15412596494214287, "grad_norm": 0.6608618497848511, "learning_rate": 9.601537705803253e-06, "loss": 0.7101, "step": 764 }, { "epoch": 0.15432770049834987, "grad_norm": 1.541603446006775, "learning_rate": 9.60025840266157e-06, "loss": 0.7053, "step": 765 }, { "epoch": 0.15452943605455685, "grad_norm": 0.4954194724559784, "learning_rate": 9.598977134650381e-06, "loss": 0.6553, "step": 766 }, { "epoch": 0.15473117161076386, "grad_norm": 1.2848337888717651, "learning_rate": 9.597693902316938e-06, "loss": 0.6651, "step": 767 }, { "epoch": 0.15493290716697083, "grad_norm": 0.6397077441215515, "learning_rate": 9.596408706209344e-06, "loss": 0.7129, "step": 768 }, { "epoch": 0.15513464272317784, "grad_norm": 0.5690326690673828, "learning_rate": 9.595121546876529e-06, "loss": 0.8142, "step": 769 }, { "epoch": 0.15533637827938485, "grad_norm": 0.42553994059562683, "learning_rate": 9.593832424868271e-06, "loss": 0.6814, "step": 770 }, { "epoch": 0.15553811383559182, "grad_norm": 0.9635345339775085, "learning_rate": 9.592541340735177e-06, "loss": 0.7081, "step": 771 }, { "epoch": 0.15573984939179883, "grad_norm": 0.41006505489349365, "learning_rate": 9.5912482950287e-06, "loss": 0.7294, "step": 772 }, { "epoch": 0.1559415849480058, "grad_norm": 0.5492969155311584, "learning_rate": 9.589953288301126e-06, "loss": 0.6791, "step": 773 }, { "epoch": 0.1561433205042128, "grad_norm": 0.5428392887115479, "learning_rate": 9.58865632110558e-06, "loss": 0.7024, "step": 774 }, { "epoch": 0.1563450560604198, "grad_norm": 1.0308176279067993, "learning_rate": 9.587357393996027e-06, "loss": 0.6896, "step": 775 }, { "epoch": 0.1565467916166268, "grad_norm": 0.4440838694572449, "learning_rate": 9.586056507527266e-06, "loss": 0.6864, "step": 776 }, { "epoch": 0.15674852717283377, "grad_norm": 0.4978935122489929, "learning_rate": 9.584753662254932e-06, "loss": 0.719, "step": 777 }, { "epoch": 0.15695026272904078, "grad_norm": 0.8401952385902405, "learning_rate": 9.5834488587355e-06, "loss": 0.7286, "step": 778 }, { "epoch": 0.15715199828524778, "grad_norm": 0.5550390481948853, "learning_rate": 9.582142097526278e-06, "loss": 0.9584, "step": 779 }, { "epoch": 0.15735373384145476, "grad_norm": 3.024136543273926, "learning_rate": 9.580833379185415e-06, "loss": 0.7976, "step": 780 }, { "epoch": 0.15755546939766177, "grad_norm": 0.5451089143753052, "learning_rate": 9.579522704271889e-06, "loss": 0.6971, "step": 781 }, { "epoch": 0.15775720495386875, "grad_norm": 0.6897755861282349, "learning_rate": 9.57821007334552e-06, "loss": 0.8471, "step": 782 }, { "epoch": 0.15795894051007575, "grad_norm": 0.5179625153541565, "learning_rate": 9.576895486966959e-06, "loss": 0.6998, "step": 783 }, { "epoch": 0.15816067606628273, "grad_norm": 0.465382844209671, "learning_rate": 9.575578945697696e-06, "loss": 0.843, "step": 784 }, { "epoch": 0.15836241162248973, "grad_norm": 6.361021518707275, "learning_rate": 9.574260450100054e-06, "loss": 0.7791, "step": 785 }, { "epoch": 0.1585641471786967, "grad_norm": 0.4968310594558716, "learning_rate": 9.57294000073719e-06, "loss": 0.741, "step": 786 }, { "epoch": 0.15876588273490372, "grad_norm": 0.4837093949317932, "learning_rate": 9.571617598173097e-06, "loss": 0.7069, "step": 787 }, { "epoch": 0.15896761829111072, "grad_norm": 0.5080758333206177, "learning_rate": 9.5702932429726e-06, "loss": 0.7032, "step": 788 }, { "epoch": 0.1591693538473177, "grad_norm": 0.3934210538864136, "learning_rate": 9.568966935701362e-06, "loss": 0.6943, "step": 789 }, { "epoch": 0.1593710894035247, "grad_norm": 1.3253793716430664, "learning_rate": 9.567638676925877e-06, "loss": 0.6914, "step": 790 }, { "epoch": 0.15957282495973169, "grad_norm": 0.5028374195098877, "learning_rate": 9.566308467213472e-06, "loss": 0.7356, "step": 791 }, { "epoch": 0.1597745605159387, "grad_norm": 0.7472164630889893, "learning_rate": 9.56497630713231e-06, "loss": 0.6967, "step": 792 }, { "epoch": 0.15997629607214567, "grad_norm": 0.5399154424667358, "learning_rate": 9.563642197251382e-06, "loss": 0.7105, "step": 793 }, { "epoch": 0.16017803162835267, "grad_norm": 0.6258538365364075, "learning_rate": 9.562306138140518e-06, "loss": 0.7155, "step": 794 }, { "epoch": 0.16037976718455965, "grad_norm": 0.44273310899734497, "learning_rate": 9.560968130370376e-06, "loss": 0.6848, "step": 795 }, { "epoch": 0.16058150274076666, "grad_norm": 0.9403547644615173, "learning_rate": 9.55962817451245e-06, "loss": 0.7942, "step": 796 }, { "epoch": 0.16078323829697366, "grad_norm": 0.5547764301300049, "learning_rate": 9.558286271139061e-06, "loss": 0.7035, "step": 797 }, { "epoch": 0.16098497385318064, "grad_norm": 0.4399620592594147, "learning_rate": 9.556942420823368e-06, "loss": 0.6614, "step": 798 }, { "epoch": 0.16118670940938765, "grad_norm": 0.5260964632034302, "learning_rate": 9.555596624139356e-06, "loss": 0.744, "step": 799 }, { "epoch": 0.16138844496559462, "grad_norm": 0.691392183303833, "learning_rate": 9.554248881661845e-06, "loss": 0.7364, "step": 800 }, { "epoch": 0.16159018052180163, "grad_norm": 0.7063742280006409, "learning_rate": 9.552899193966484e-06, "loss": 0.9067, "step": 801 }, { "epoch": 0.1617919160780086, "grad_norm": 0.5435588955879211, "learning_rate": 9.551547561629755e-06, "loss": 0.7611, "step": 802 }, { "epoch": 0.1619936516342156, "grad_norm": 0.5623730421066284, "learning_rate": 9.550193985228968e-06, "loss": 0.7608, "step": 803 }, { "epoch": 0.1621953871904226, "grad_norm": 1.5376967191696167, "learning_rate": 9.548838465342266e-06, "loss": 0.7438, "step": 804 }, { "epoch": 0.1623971227466296, "grad_norm": 0.5251892805099487, "learning_rate": 9.54748100254862e-06, "loss": 0.916, "step": 805 }, { "epoch": 0.1625988583028366, "grad_norm": 0.7586867213249207, "learning_rate": 9.54612159742783e-06, "loss": 0.84, "step": 806 }, { "epoch": 0.16280059385904358, "grad_norm": 0.3578733205795288, "learning_rate": 9.544760250560531e-06, "loss": 0.713, "step": 807 }, { "epoch": 0.16300232941525059, "grad_norm": 1.0173057317733765, "learning_rate": 9.54339696252818e-06, "loss": 0.7829, "step": 808 }, { "epoch": 0.16320406497145756, "grad_norm": 0.5583421587944031, "learning_rate": 9.542031733913069e-06, "loss": 0.7975, "step": 809 }, { "epoch": 0.16340580052766457, "grad_norm": 0.409379780292511, "learning_rate": 9.540664565298315e-06, "loss": 0.6718, "step": 810 }, { "epoch": 0.16360753608387155, "grad_norm": 0.44500353932380676, "learning_rate": 9.539295457267865e-06, "loss": 0.7731, "step": 811 }, { "epoch": 0.16380927164007855, "grad_norm": 0.49359723925590515, "learning_rate": 9.537924410406495e-06, "loss": 0.6798, "step": 812 }, { "epoch": 0.16401100719628556, "grad_norm": 0.5281481742858887, "learning_rate": 9.536551425299812e-06, "loss": 0.71, "step": 813 }, { "epoch": 0.16421274275249254, "grad_norm": 0.4787105321884155, "learning_rate": 9.535176502534242e-06, "loss": 0.7343, "step": 814 }, { "epoch": 0.16441447830869954, "grad_norm": 0.5769368410110474, "learning_rate": 9.533799642697047e-06, "loss": 0.6915, "step": 815 }, { "epoch": 0.16461621386490652, "grad_norm": 0.44727253913879395, "learning_rate": 9.532420846376316e-06, "loss": 0.86, "step": 816 }, { "epoch": 0.16481794942111352, "grad_norm": 0.5646665692329407, "learning_rate": 9.531040114160958e-06, "loss": 0.8393, "step": 817 }, { "epoch": 0.1650196849773205, "grad_norm": 1.3017197847366333, "learning_rate": 9.529657446640714e-06, "loss": 0.7177, "step": 818 }, { "epoch": 0.1652214205335275, "grad_norm": 1.5386371612548828, "learning_rate": 9.528272844406154e-06, "loss": 0.8101, "step": 819 }, { "epoch": 0.16542315608973449, "grad_norm": 0.4335991144180298, "learning_rate": 9.52688630804867e-06, "loss": 0.6833, "step": 820 }, { "epoch": 0.1656248916459415, "grad_norm": 0.8145807981491089, "learning_rate": 9.52549783816048e-06, "loss": 0.8993, "step": 821 }, { "epoch": 0.1658266272021485, "grad_norm": 0.40622222423553467, "learning_rate": 9.524107435334633e-06, "loss": 0.8246, "step": 822 }, { "epoch": 0.16602836275835547, "grad_norm": 2.768005609512329, "learning_rate": 9.522715100164996e-06, "loss": 0.9468, "step": 823 }, { "epoch": 0.16623009831456248, "grad_norm": 0.8487063050270081, "learning_rate": 9.521320833246268e-06, "loss": 1.1553, "step": 824 }, { "epoch": 0.16643183387076946, "grad_norm": 0.8210890889167786, "learning_rate": 9.51992463517397e-06, "loss": 0.7387, "step": 825 }, { "epoch": 0.16663356942697646, "grad_norm": 0.40662023425102234, "learning_rate": 9.518526506544447e-06, "loss": 1.0524, "step": 826 }, { "epoch": 0.16683530498318344, "grad_norm": 0.932871401309967, "learning_rate": 9.517126447954872e-06, "loss": 0.7819, "step": 827 }, { "epoch": 0.16703704053939045, "grad_norm": 0.5390327572822571, "learning_rate": 9.515724460003238e-06, "loss": 0.7195, "step": 828 }, { "epoch": 0.16723877609559742, "grad_norm": 0.38334664702415466, "learning_rate": 9.514320543288367e-06, "loss": 0.6943, "step": 829 }, { "epoch": 0.16744051165180443, "grad_norm": 1.7166111469268799, "learning_rate": 9.512914698409898e-06, "loss": 0.9641, "step": 830 }, { "epoch": 0.16764224720801144, "grad_norm": 0.5833882093429565, "learning_rate": 9.511506925968302e-06, "loss": 0.6696, "step": 831 }, { "epoch": 0.1678439827642184, "grad_norm": 0.7301717400550842, "learning_rate": 9.510097226564866e-06, "loss": 0.8498, "step": 832 }, { "epoch": 0.16804571832042542, "grad_norm": 0.4588780701160431, "learning_rate": 9.508685600801704e-06, "loss": 0.6698, "step": 833 }, { "epoch": 0.1682474538766324, "grad_norm": 0.4560970366001129, "learning_rate": 9.507272049281752e-06, "loss": 0.7969, "step": 834 }, { "epoch": 0.1684491894328394, "grad_norm": 0.5290451645851135, "learning_rate": 9.50585657260877e-06, "loss": 0.7079, "step": 835 }, { "epoch": 0.16865092498904638, "grad_norm": 0.49037396907806396, "learning_rate": 9.504439171387334e-06, "loss": 1.0519, "step": 836 }, { "epoch": 0.16885266054525339, "grad_norm": 0.5754956603050232, "learning_rate": 9.503019846222849e-06, "loss": 0.745, "step": 837 }, { "epoch": 0.16905439610146036, "grad_norm": 0.7032858729362488, "learning_rate": 9.501598597721542e-06, "loss": 0.7016, "step": 838 }, { "epoch": 0.16925613165766737, "grad_norm": 0.5736189484596252, "learning_rate": 9.500175426490455e-06, "loss": 1.0468, "step": 839 }, { "epoch": 0.16945786721387437, "grad_norm": 0.48139849305152893, "learning_rate": 9.498750333137456e-06, "loss": 0.8038, "step": 840 }, { "epoch": 0.16965960277008135, "grad_norm": 0.5781999230384827, "learning_rate": 9.497323318271237e-06, "loss": 0.6912, "step": 841 }, { "epoch": 0.16986133832628836, "grad_norm": 0.41525569558143616, "learning_rate": 9.4958943825013e-06, "loss": 0.8743, "step": 842 }, { "epoch": 0.17006307388249534, "grad_norm": 0.4945438504219055, "learning_rate": 9.494463526437979e-06, "loss": 0.7092, "step": 843 }, { "epoch": 0.17026480943870234, "grad_norm": 0.7058941125869751, "learning_rate": 9.493030750692422e-06, "loss": 0.7403, "step": 844 }, { "epoch": 0.17046654499490932, "grad_norm": 2.3633134365081787, "learning_rate": 9.4915960558766e-06, "loss": 0.6802, "step": 845 }, { "epoch": 0.17066828055111632, "grad_norm": 0.5798237323760986, "learning_rate": 9.4901594426033e-06, "loss": 0.7799, "step": 846 }, { "epoch": 0.1708700161073233, "grad_norm": 1.2589781284332275, "learning_rate": 9.488720911486131e-06, "loss": 0.6914, "step": 847 }, { "epoch": 0.1710717516635303, "grad_norm": 0.611457109451294, "learning_rate": 9.487280463139521e-06, "loss": 0.6979, "step": 848 }, { "epoch": 0.1712734872197373, "grad_norm": 0.846007227897644, "learning_rate": 9.485838098178715e-06, "loss": 0.9793, "step": 849 }, { "epoch": 0.1714752227759443, "grad_norm": 0.4944300949573517, "learning_rate": 9.48439381721978e-06, "loss": 0.6902, "step": 850 }, { "epoch": 0.1716769583321513, "grad_norm": 0.3780556321144104, "learning_rate": 9.482947620879601e-06, "loss": 0.8286, "step": 851 }, { "epoch": 0.17187869388835827, "grad_norm": 0.8853325247764587, "learning_rate": 9.481499509775878e-06, "loss": 0.7205, "step": 852 }, { "epoch": 0.17208042944456528, "grad_norm": 0.7118326425552368, "learning_rate": 9.480049484527127e-06, "loss": 0.7851, "step": 853 }, { "epoch": 0.17228216500077226, "grad_norm": 0.4678933620452881, "learning_rate": 9.47859754575269e-06, "loss": 0.6764, "step": 854 }, { "epoch": 0.17248390055697926, "grad_norm": 0.9232149124145508, "learning_rate": 9.477143694072721e-06, "loss": 0.7186, "step": 855 }, { "epoch": 0.17268563611318627, "grad_norm": 0.6610773801803589, "learning_rate": 9.475687930108188e-06, "loss": 0.6943, "step": 856 }, { "epoch": 0.17288737166939325, "grad_norm": 0.5761659741401672, "learning_rate": 9.47423025448088e-06, "loss": 0.6806, "step": 857 }, { "epoch": 0.17308910722560025, "grad_norm": 0.448404461145401, "learning_rate": 9.472770667813406e-06, "loss": 0.7138, "step": 858 }, { "epoch": 0.17329084278180723, "grad_norm": 0.4232058823108673, "learning_rate": 9.471309170729182e-06, "loss": 0.6921, "step": 859 }, { "epoch": 0.17349257833801424, "grad_norm": 0.4294980466365814, "learning_rate": 9.469845763852447e-06, "loss": 0.8345, "step": 860 }, { "epoch": 0.17369431389422121, "grad_norm": 0.40628236532211304, "learning_rate": 9.468380447808251e-06, "loss": 0.7114, "step": 861 }, { "epoch": 0.17389604945042822, "grad_norm": 0.42344218492507935, "learning_rate": 9.466913223222467e-06, "loss": 0.851, "step": 862 }, { "epoch": 0.1740977850066352, "grad_norm": 1.0432575941085815, "learning_rate": 9.465444090721775e-06, "loss": 0.8181, "step": 863 }, { "epoch": 0.1742995205628422, "grad_norm": 0.848474383354187, "learning_rate": 9.463973050933674e-06, "loss": 0.7649, "step": 864 }, { "epoch": 0.1745012561190492, "grad_norm": 0.513373851776123, "learning_rate": 9.462500104486476e-06, "loss": 0.6926, "step": 865 }, { "epoch": 0.1747029916752562, "grad_norm": 1.0103217363357544, "learning_rate": 9.461025252009308e-06, "loss": 0.7174, "step": 866 }, { "epoch": 0.1749047272314632, "grad_norm": 0.5421655178070068, "learning_rate": 9.45954849413211e-06, "loss": 0.7147, "step": 867 }, { "epoch": 0.17510646278767017, "grad_norm": 1.8900377750396729, "learning_rate": 9.458069831485643e-06, "loss": 0.6938, "step": 868 }, { "epoch": 0.17530819834387718, "grad_norm": 0.7132319808006287, "learning_rate": 9.45658926470147e-06, "loss": 0.6805, "step": 869 }, { "epoch": 0.17550993390008415, "grad_norm": 1.216712474822998, "learning_rate": 9.455106794411974e-06, "loss": 0.7156, "step": 870 }, { "epoch": 0.17571166945629116, "grad_norm": 1.3393114805221558, "learning_rate": 9.453622421250353e-06, "loss": 0.6761, "step": 871 }, { "epoch": 0.17591340501249814, "grad_norm": 0.6455486416816711, "learning_rate": 9.45213614585061e-06, "loss": 0.6961, "step": 872 }, { "epoch": 0.17611514056870514, "grad_norm": 0.6910165548324585, "learning_rate": 9.45064796884757e-06, "loss": 0.8834, "step": 873 }, { "epoch": 0.17631687612491215, "grad_norm": 1.8813116550445557, "learning_rate": 9.449157890876862e-06, "loss": 0.6928, "step": 874 }, { "epoch": 0.17651861168111913, "grad_norm": 2.003828763961792, "learning_rate": 9.44766591257493e-06, "loss": 0.6792, "step": 875 }, { "epoch": 0.17672034723732613, "grad_norm": 2.5982606410980225, "learning_rate": 9.446172034579034e-06, "loss": 0.9014, "step": 876 }, { "epoch": 0.1769220827935331, "grad_norm": 0.8602038621902466, "learning_rate": 9.44467625752724e-06, "loss": 0.8226, "step": 877 }, { "epoch": 0.17712381834974011, "grad_norm": 0.7308774590492249, "learning_rate": 9.443178582058423e-06, "loss": 0.8395, "step": 878 }, { "epoch": 0.1773255539059471, "grad_norm": 0.41064178943634033, "learning_rate": 9.441679008812277e-06, "loss": 0.9711, "step": 879 }, { "epoch": 0.1775272894621541, "grad_norm": 0.43151190876960754, "learning_rate": 9.440177538429299e-06, "loss": 0.7408, "step": 880 }, { "epoch": 0.17772902501836108, "grad_norm": 0.5189063549041748, "learning_rate": 9.438674171550801e-06, "loss": 0.7126, "step": 881 }, { "epoch": 0.17793076057456808, "grad_norm": 0.5402753949165344, "learning_rate": 9.437168908818904e-06, "loss": 0.6815, "step": 882 }, { "epoch": 0.1781324961307751, "grad_norm": 0.7131788730621338, "learning_rate": 9.435661750876537e-06, "loss": 0.7269, "step": 883 }, { "epoch": 0.17833423168698206, "grad_norm": 0.8747779726982117, "learning_rate": 9.43415269836744e-06, "loss": 0.7123, "step": 884 }, { "epoch": 0.17853596724318907, "grad_norm": 0.8151087760925293, "learning_rate": 9.432641751936162e-06, "loss": 0.7792, "step": 885 }, { "epoch": 0.17873770279939605, "grad_norm": 0.45538297295570374, "learning_rate": 9.43112891222806e-06, "loss": 0.6891, "step": 886 }, { "epoch": 0.17893943835560305, "grad_norm": 0.8422892689704895, "learning_rate": 9.429614179889302e-06, "loss": 0.7761, "step": 887 }, { "epoch": 0.17914117391181003, "grad_norm": 0.9492774605751038, "learning_rate": 9.428097555566859e-06, "loss": 0.6858, "step": 888 }, { "epoch": 0.17934290946801704, "grad_norm": 0.5029323697090149, "learning_rate": 9.42657903990852e-06, "loss": 0.7098, "step": 889 }, { "epoch": 0.17954464502422401, "grad_norm": 0.755977213382721, "learning_rate": 9.42505863356287e-06, "loss": 0.83, "step": 890 }, { "epoch": 0.17974638058043102, "grad_norm": 0.5003045201301575, "learning_rate": 9.42353633717931e-06, "loss": 0.8131, "step": 891 }, { "epoch": 0.17994811613663803, "grad_norm": 0.6524782180786133, "learning_rate": 9.422012151408046e-06, "loss": 0.7094, "step": 892 }, { "epoch": 0.180149851692845, "grad_norm": 0.8290368318557739, "learning_rate": 9.42048607690009e-06, "loss": 0.6777, "step": 893 }, { "epoch": 0.180351587249052, "grad_norm": 0.718972384929657, "learning_rate": 9.418958114307263e-06, "loss": 0.7676, "step": 894 }, { "epoch": 0.180553322805259, "grad_norm": 0.4021669030189514, "learning_rate": 9.417428264282186e-06, "loss": 0.6786, "step": 895 }, { "epoch": 0.180755058361466, "grad_norm": 0.5108888149261475, "learning_rate": 9.415896527478297e-06, "loss": 0.8632, "step": 896 }, { "epoch": 0.18095679391767297, "grad_norm": 0.4423260986804962, "learning_rate": 9.414362904549829e-06, "loss": 0.7235, "step": 897 }, { "epoch": 0.18115852947387998, "grad_norm": 1.0673054456710815, "learning_rate": 9.412827396151827e-06, "loss": 0.7061, "step": 898 }, { "epoch": 0.18136026503008695, "grad_norm": 0.6691890954971313, "learning_rate": 9.411290002940141e-06, "loss": 0.7062, "step": 899 }, { "epoch": 0.18156200058629396, "grad_norm": 0.49868547916412354, "learning_rate": 9.409750725571422e-06, "loss": 0.76, "step": 900 }, { "epoch": 0.18176373614250096, "grad_norm": 1.40470552444458, "learning_rate": 9.408209564703133e-06, "loss": 0.6882, "step": 901 }, { "epoch": 0.18196547169870794, "grad_norm": 1.4107697010040283, "learning_rate": 9.40666652099353e-06, "loss": 0.8416, "step": 902 }, { "epoch": 0.18216720725491495, "grad_norm": 0.4921844005584717, "learning_rate": 9.405121595101688e-06, "loss": 0.8499, "step": 903 }, { "epoch": 0.18236894281112193, "grad_norm": 1.1987202167510986, "learning_rate": 9.403574787687474e-06, "loss": 0.6613, "step": 904 }, { "epoch": 0.18257067836732893, "grad_norm": 2.4473557472229004, "learning_rate": 9.402026099411563e-06, "loss": 0.9773, "step": 905 }, { "epoch": 0.1827724139235359, "grad_norm": 0.5587297677993774, "learning_rate": 9.400475530935433e-06, "loss": 0.7263, "step": 906 }, { "epoch": 0.18297414947974291, "grad_norm": 1.1013250350952148, "learning_rate": 9.398923082921366e-06, "loss": 0.8863, "step": 907 }, { "epoch": 0.18317588503594992, "grad_norm": 1.5462638139724731, "learning_rate": 9.397368756032445e-06, "loss": 0.6677, "step": 908 }, { "epoch": 0.1833776205921569, "grad_norm": 0.9664633870124817, "learning_rate": 9.395812550932559e-06, "loss": 0.6855, "step": 909 }, { "epoch": 0.1835793561483639, "grad_norm": 0.5382829904556274, "learning_rate": 9.394254468286395e-06, "loss": 0.7645, "step": 910 }, { "epoch": 0.18378109170457088, "grad_norm": 1.6417592763900757, "learning_rate": 9.392694508759443e-06, "loss": 0.7324, "step": 911 }, { "epoch": 0.1839828272607779, "grad_norm": 1.6140865087509155, "learning_rate": 9.391132673017995e-06, "loss": 0.6887, "step": 912 }, { "epoch": 0.18418456281698486, "grad_norm": 1031.276123046875, "learning_rate": 9.389568961729148e-06, "loss": 0.8069, "step": 913 }, { "epoch": 0.18438629837319187, "grad_norm": 16818.4453125, "learning_rate": 9.388003375560792e-06, "loss": 0.6865, "step": 914 }, { "epoch": 0.18458803392939885, "grad_norm": 1720.62548828125, "learning_rate": 9.386435915181626e-06, "loss": 0.7457, "step": 915 }, { "epoch": 0.18478976948560585, "grad_norm": 680.1848754882812, "learning_rate": 9.384866581261145e-06, "loss": 0.7248, "step": 916 }, { "epoch": 0.18499150504181286, "grad_norm": 2.356326103210449, "learning_rate": 9.383295374469646e-06, "loss": 0.759, "step": 917 }, { "epoch": 0.18519324059801984, "grad_norm": 1.6549099683761597, "learning_rate": 9.381722295478227e-06, "loss": 0.7201, "step": 918 }, { "epoch": 0.18539497615422684, "grad_norm": 0.6107184886932373, "learning_rate": 9.380147344958778e-06, "loss": 0.7285, "step": 919 }, { "epoch": 0.18559671171043382, "grad_norm": 0.675512433052063, "learning_rate": 9.378570523583999e-06, "loss": 0.7225, "step": 920 }, { "epoch": 0.18579844726664083, "grad_norm": 0.6857936382293701, "learning_rate": 9.376991832027385e-06, "loss": 0.8173, "step": 921 }, { "epoch": 0.1860001828228478, "grad_norm": 0.451214998960495, "learning_rate": 9.375411270963226e-06, "loss": 0.6989, "step": 922 }, { "epoch": 0.1862019183790548, "grad_norm": 1.1534514427185059, "learning_rate": 9.373828841066616e-06, "loss": 1.0878, "step": 923 }, { "epoch": 0.1864036539352618, "grad_norm": 0.38888052105903625, "learning_rate": 9.372244543013444e-06, "loss": 0.6825, "step": 924 }, { "epoch": 0.1866053894914688, "grad_norm": 0.766791820526123, "learning_rate": 9.370658377480399e-06, "loss": 0.6974, "step": 925 }, { "epoch": 0.1868071250476758, "grad_norm": 0.7451943159103394, "learning_rate": 9.369070345144966e-06, "loss": 0.6389, "step": 926 }, { "epoch": 0.18700886060388278, "grad_norm": 0.5548701286315918, "learning_rate": 9.367480446685427e-06, "loss": 0.8277, "step": 927 }, { "epoch": 0.18721059616008978, "grad_norm": 0.42171385884284973, "learning_rate": 9.365888682780862e-06, "loss": 0.8162, "step": 928 }, { "epoch": 0.18741233171629676, "grad_norm": 0.8420100212097168, "learning_rate": 9.364295054111147e-06, "loss": 0.7352, "step": 929 }, { "epoch": 0.18761406727250377, "grad_norm": 0.5614784955978394, "learning_rate": 9.362699561356957e-06, "loss": 0.676, "step": 930 }, { "epoch": 0.18781580282871074, "grad_norm": 1.4516627788543701, "learning_rate": 9.361102205199762e-06, "loss": 0.7014, "step": 931 }, { "epoch": 0.18801753838491775, "grad_norm": 1.1581453084945679, "learning_rate": 9.359502986321823e-06, "loss": 0.6991, "step": 932 }, { "epoch": 0.18821927394112473, "grad_norm": 0.8086721897125244, "learning_rate": 9.357901905406204e-06, "loss": 0.7994, "step": 933 }, { "epoch": 0.18842100949733173, "grad_norm": 1.4955883026123047, "learning_rate": 9.356298963136763e-06, "loss": 0.7284, "step": 934 }, { "epoch": 0.18862274505353874, "grad_norm": 0.9167599081993103, "learning_rate": 9.354694160198146e-06, "loss": 0.7033, "step": 935 }, { "epoch": 0.18882448060974572, "grad_norm": 1.0095226764678955, "learning_rate": 9.353087497275804e-06, "loss": 0.6968, "step": 936 }, { "epoch": 0.18902621616595272, "grad_norm": 0.897120475769043, "learning_rate": 9.351478975055973e-06, "loss": 0.7284, "step": 937 }, { "epoch": 0.1892279517221597, "grad_norm": 1.160873293876648, "learning_rate": 9.349868594225692e-06, "loss": 1.2516, "step": 938 }, { "epoch": 0.1894296872783667, "grad_norm": 0.9946362376213074, "learning_rate": 9.348256355472787e-06, "loss": 0.7964, "step": 939 }, { "epoch": 0.18963142283457368, "grad_norm": 0.6938387155532837, "learning_rate": 9.34664225948588e-06, "loss": 0.6529, "step": 940 }, { "epoch": 0.1898331583907807, "grad_norm": 0.7048420310020447, "learning_rate": 9.345026306954385e-06, "loss": 0.6904, "step": 941 }, { "epoch": 0.19003489394698767, "grad_norm": 0.8328052163124084, "learning_rate": 9.343408498568512e-06, "loss": 0.7811, "step": 942 }, { "epoch": 0.19023662950319467, "grad_norm": 0.4666134715080261, "learning_rate": 9.34178883501926e-06, "loss": 0.7828, "step": 943 }, { "epoch": 0.19043836505940168, "grad_norm": 0.39586102962493896, "learning_rate": 9.340167316998425e-06, "loss": 0.7562, "step": 944 }, { "epoch": 0.19064010061560865, "grad_norm": 0.4975466728210449, "learning_rate": 9.33854394519859e-06, "loss": 0.787, "step": 945 }, { "epoch": 0.19084183617181566, "grad_norm": 0.43210187554359436, "learning_rate": 9.336918720313133e-06, "loss": 0.8803, "step": 946 }, { "epoch": 0.19104357172802264, "grad_norm": 0.6917931437492371, "learning_rate": 9.335291643036221e-06, "loss": 0.6514, "step": 947 }, { "epoch": 0.19124530728422964, "grad_norm": 0.5619732737541199, "learning_rate": 9.333662714062818e-06, "loss": 0.6951, "step": 948 }, { "epoch": 0.19144704284043662, "grad_norm": 0.9777224063873291, "learning_rate": 9.33203193408867e-06, "loss": 0.6511, "step": 949 }, { "epoch": 0.19164877839664363, "grad_norm": 0.5258626341819763, "learning_rate": 9.33039930381032e-06, "loss": 0.7898, "step": 950 }, { "epoch": 0.1918505139528506, "grad_norm": 0.48357635736465454, "learning_rate": 9.3287648239251e-06, "loss": 0.7166, "step": 951 }, { "epoch": 0.1920522495090576, "grad_norm": 0.48291975259780884, "learning_rate": 9.32712849513113e-06, "loss": 0.6984, "step": 952 }, { "epoch": 0.19225398506526462, "grad_norm": 0.9965933561325073, "learning_rate": 9.325490318127323e-06, "loss": 0.7334, "step": 953 }, { "epoch": 0.1924557206214716, "grad_norm": 0.562293291091919, "learning_rate": 9.32385029361338e-06, "loss": 0.6767, "step": 954 }, { "epoch": 0.1926574561776786, "grad_norm": 1.084022879600525, "learning_rate": 9.32220842228979e-06, "loss": 0.6759, "step": 955 }, { "epoch": 0.19285919173388558, "grad_norm": 0.804706335067749, "learning_rate": 9.32056470485783e-06, "loss": 0.7241, "step": 956 }, { "epoch": 0.19306092729009258, "grad_norm": 0.3961019814014435, "learning_rate": 9.318919142019572e-06, "loss": 0.7219, "step": 957 }, { "epoch": 0.19326266284629956, "grad_norm": 0.5734902620315552, "learning_rate": 9.317271734477865e-06, "loss": 0.6651, "step": 958 }, { "epoch": 0.19346439840250657, "grad_norm": 0.4710521101951599, "learning_rate": 9.315622482936356e-06, "loss": 0.6818, "step": 959 }, { "epoch": 0.19366613395871357, "grad_norm": 0.881702721118927, "learning_rate": 9.313971388099476e-06, "loss": 0.7027, "step": 960 }, { "epoch": 0.19386786951492055, "grad_norm": 0.4663563668727875, "learning_rate": 9.312318450672441e-06, "loss": 0.6915, "step": 961 }, { "epoch": 0.19406960507112755, "grad_norm": 0.8489170670509338, "learning_rate": 9.31066367136126e-06, "loss": 0.6745, "step": 962 }, { "epoch": 0.19427134062733453, "grad_norm": 0.6792709827423096, "learning_rate": 9.309007050872722e-06, "loss": 0.8499, "step": 963 }, { "epoch": 0.19447307618354154, "grad_norm": 0.7607674598693848, "learning_rate": 9.307348589914405e-06, "loss": 0.6581, "step": 964 }, { "epoch": 0.19467481173974852, "grad_norm": 0.74134761095047, "learning_rate": 9.305688289194673e-06, "loss": 0.6741, "step": 965 }, { "epoch": 0.19487654729595552, "grad_norm": 0.8747991323471069, "learning_rate": 9.30402614942268e-06, "loss": 0.6974, "step": 966 }, { "epoch": 0.1950782828521625, "grad_norm": 0.7645976543426514, "learning_rate": 9.302362171308358e-06, "loss": 0.6842, "step": 967 }, { "epoch": 0.1952800184083695, "grad_norm": 1.2800146341323853, "learning_rate": 9.30069635556243e-06, "loss": 0.6564, "step": 968 }, { "epoch": 0.1954817539645765, "grad_norm": 0.6098384857177734, "learning_rate": 9.299028702896402e-06, "loss": 0.8256, "step": 969 }, { "epoch": 0.1956834895207835, "grad_norm": 0.6422080993652344, "learning_rate": 9.29735921402256e-06, "loss": 0.6772, "step": 970 }, { "epoch": 0.1958852250769905, "grad_norm": 0.42048829793930054, "learning_rate": 9.295687889653986e-06, "loss": 0.6898, "step": 971 }, { "epoch": 0.19608696063319747, "grad_norm": 0.3918842673301697, "learning_rate": 9.294014730504532e-06, "loss": 0.7526, "step": 972 }, { "epoch": 0.19628869618940448, "grad_norm": 0.3806288242340088, "learning_rate": 9.292339737288844e-06, "loss": 0.7355, "step": 973 }, { "epoch": 0.19649043174561145, "grad_norm": 0.6645257472991943, "learning_rate": 9.290662910722346e-06, "loss": 0.6992, "step": 974 }, { "epoch": 0.19669216730181846, "grad_norm": 1.5524142980575562, "learning_rate": 9.288984251521246e-06, "loss": 0.7014, "step": 975 }, { "epoch": 0.19689390285802544, "grad_norm": 1.6902189254760742, "learning_rate": 9.28730376040254e-06, "loss": 0.6856, "step": 976 }, { "epoch": 0.19709563841423244, "grad_norm": 0.6375247836112976, "learning_rate": 9.285621438083997e-06, "loss": 0.6822, "step": 977 }, { "epoch": 0.19729737397043945, "grad_norm": 0.44163259863853455, "learning_rate": 9.283937285284177e-06, "loss": 0.7284, "step": 978 }, { "epoch": 0.19749910952664643, "grad_norm": 0.6588442325592041, "learning_rate": 9.282251302722416e-06, "loss": 0.7268, "step": 979 }, { "epoch": 0.19770084508285343, "grad_norm": 1.0238844156265259, "learning_rate": 9.280563491118833e-06, "loss": 0.6698, "step": 980 }, { "epoch": 0.1979025806390604, "grad_norm": 0.7650852203369141, "learning_rate": 9.278873851194328e-06, "loss": 1.0946, "step": 981 }, { "epoch": 0.19810431619526742, "grad_norm": 0.6054670214653015, "learning_rate": 9.277182383670584e-06, "loss": 0.6911, "step": 982 }, { "epoch": 0.1983060517514744, "grad_norm": 0.33824777603149414, "learning_rate": 9.275489089270064e-06, "loss": 0.6725, "step": 983 }, { "epoch": 0.1985077873076814, "grad_norm": 0.6010570526123047, "learning_rate": 9.27379396871601e-06, "loss": 0.6839, "step": 984 }, { "epoch": 0.19870952286388838, "grad_norm": 0.6706582903862, "learning_rate": 9.272097022732444e-06, "loss": 0.6889, "step": 985 }, { "epoch": 0.19891125842009538, "grad_norm": 0.4594559669494629, "learning_rate": 9.270398252044169e-06, "loss": 0.7069, "step": 986 }, { "epoch": 0.1991129939763024, "grad_norm": 0.5065109133720398, "learning_rate": 9.268697657376765e-06, "loss": 0.7619, "step": 987 }, { "epoch": 0.19931472953250937, "grad_norm": 0.5241512060165405, "learning_rate": 9.266995239456593e-06, "loss": 0.8801, "step": 988 }, { "epoch": 0.19951646508871637, "grad_norm": 0.4009891152381897, "learning_rate": 9.265290999010794e-06, "loss": 0.9087, "step": 989 }, { "epoch": 0.19971820064492335, "grad_norm": 0.8747932314872742, "learning_rate": 9.263584936767282e-06, "loss": 0.8327, "step": 990 }, { "epoch": 0.19991993620113035, "grad_norm": 1.358296513557434, "learning_rate": 9.26187705345476e-06, "loss": 0.7112, "step": 991 }, { "epoch": 0.20012167175733733, "grad_norm": 0.3803250789642334, "learning_rate": 9.260167349802696e-06, "loss": 0.7441, "step": 992 }, { "epoch": 0.20032340731354434, "grad_norm": 0.6250929236412048, "learning_rate": 9.258455826541341e-06, "loss": 0.7045, "step": 993 }, { "epoch": 0.20052514286975132, "grad_norm": 0.6147077679634094, "learning_rate": 9.256742484401728e-06, "loss": 0.7959, "step": 994 }, { "epoch": 0.20072687842595832, "grad_norm": 0.6384791731834412, "learning_rate": 9.255027324115657e-06, "loss": 0.7937, "step": 995 }, { "epoch": 0.20092861398216533, "grad_norm": 3.0452754497528076, "learning_rate": 9.253310346415714e-06, "loss": 0.6903, "step": 996 }, { "epoch": 0.2011303495383723, "grad_norm": 3.768707036972046, "learning_rate": 9.251591552035255e-06, "loss": 0.7988, "step": 997 }, { "epoch": 0.2013320850945793, "grad_norm": 0.8928516507148743, "learning_rate": 9.249870941708416e-06, "loss": 0.6586, "step": 998 }, { "epoch": 0.2015338206507863, "grad_norm": 0.6847289204597473, "learning_rate": 9.248148516170106e-06, "loss": 0.6983, "step": 999 }, { "epoch": 0.2017355562069933, "grad_norm": 1.5991216897964478, "learning_rate": 9.246424276156008e-06, "loss": 0.7384, "step": 1000 }, { "epoch": 0.20193729176320027, "grad_norm": 1.0704641342163086, "learning_rate": 9.244698222402584e-06, "loss": 0.7639, "step": 1001 }, { "epoch": 0.20213902731940728, "grad_norm": 0.6123541593551636, "learning_rate": 9.24297035564707e-06, "loss": 0.8269, "step": 1002 }, { "epoch": 0.20234076287561428, "grad_norm": 0.8308938145637512, "learning_rate": 9.241240676627472e-06, "loss": 0.7285, "step": 1003 }, { "epoch": 0.20254249843182126, "grad_norm": 1.161763072013855, "learning_rate": 9.239509186082574e-06, "loss": 0.6906, "step": 1004 }, { "epoch": 0.20274423398802827, "grad_norm": 0.8944821357727051, "learning_rate": 9.237775884751936e-06, "loss": 0.83, "step": 1005 }, { "epoch": 0.20294596954423524, "grad_norm": 0.6827241778373718, "learning_rate": 9.236040773375884e-06, "loss": 0.8459, "step": 1006 }, { "epoch": 0.20314770510044225, "grad_norm": 0.60840904712677, "learning_rate": 9.234303852695526e-06, "loss": 0.8371, "step": 1007 }, { "epoch": 0.20334944065664923, "grad_norm": 0.3988000452518463, "learning_rate": 9.232565123452734e-06, "loss": 0.6888, "step": 1008 }, { "epoch": 0.20355117621285623, "grad_norm": 0.7547544836997986, "learning_rate": 9.23082458639016e-06, "loss": 0.6639, "step": 1009 }, { "epoch": 0.2037529117690632, "grad_norm": 0.7112361192703247, "learning_rate": 9.229082242251222e-06, "loss": 0.7103, "step": 1010 }, { "epoch": 0.20395464732527022, "grad_norm": 0.4319564700126648, "learning_rate": 9.227338091780116e-06, "loss": 0.6841, "step": 1011 }, { "epoch": 0.20415638288147722, "grad_norm": 0.4466502070426941, "learning_rate": 9.225592135721802e-06, "loss": 0.8465, "step": 1012 }, { "epoch": 0.2043581184376842, "grad_norm": 0.7582955956459045, "learning_rate": 9.22384437482202e-06, "loss": 0.8686, "step": 1013 }, { "epoch": 0.2045598539938912, "grad_norm": 0.9325213432312012, "learning_rate": 9.222094809827272e-06, "loss": 0.7334, "step": 1014 }, { "epoch": 0.20476158955009818, "grad_norm": 0.841929018497467, "learning_rate": 9.220343441484837e-06, "loss": 0.6743, "step": 1015 }, { "epoch": 0.2049633251063052, "grad_norm": 0.3721573054790497, "learning_rate": 9.218590270542765e-06, "loss": 1.0676, "step": 1016 }, { "epoch": 0.20516506066251217, "grad_norm": 0.509548008441925, "learning_rate": 9.216835297749869e-06, "loss": 0.706, "step": 1017 }, { "epoch": 0.20536679621871917, "grad_norm": 0.5040112137794495, "learning_rate": 9.215078523855736e-06, "loss": 0.7995, "step": 1018 }, { "epoch": 0.20556853177492615, "grad_norm": 0.570552408695221, "learning_rate": 9.213319949610727e-06, "loss": 0.6729, "step": 1019 }, { "epoch": 0.20577026733113316, "grad_norm": 0.6146701574325562, "learning_rate": 9.211559575765958e-06, "loss": 0.6371, "step": 1020 }, { "epoch": 0.20597200288734016, "grad_norm": 0.4954344630241394, "learning_rate": 9.209797403073331e-06, "loss": 0.7039, "step": 1021 }, { "epoch": 0.20617373844354714, "grad_norm": 0.4436202049255371, "learning_rate": 9.208033432285503e-06, "loss": 0.7906, "step": 1022 }, { "epoch": 0.20637547399975414, "grad_norm": 0.4960552453994751, "learning_rate": 9.206267664155906e-06, "loss": 0.6918, "step": 1023 }, { "epoch": 0.20657720955596112, "grad_norm": 0.47839146852493286, "learning_rate": 9.204500099438739e-06, "loss": 0.819, "step": 1024 }, { "epoch": 0.20677894511216813, "grad_norm": 0.5067861676216125, "learning_rate": 9.202730738888962e-06, "loss": 0.6683, "step": 1025 }, { "epoch": 0.2069806806683751, "grad_norm": 0.4200279414653778, "learning_rate": 9.200959583262312e-06, "loss": 0.6968, "step": 1026 }, { "epoch": 0.2071824162245821, "grad_norm": 0.49858543276786804, "learning_rate": 9.199186633315286e-06, "loss": 0.898, "step": 1027 }, { "epoch": 0.2073841517807891, "grad_norm": 1.0465232133865356, "learning_rate": 9.197411889805148e-06, "loss": 0.7353, "step": 1028 }, { "epoch": 0.2075858873369961, "grad_norm": 0.8756152391433716, "learning_rate": 9.195635353489932e-06, "loss": 0.698, "step": 1029 }, { "epoch": 0.2077876228932031, "grad_norm": 1.9849929809570312, "learning_rate": 9.193857025128431e-06, "loss": 0.713, "step": 1030 }, { "epoch": 0.20798935844941008, "grad_norm": 1.5864392518997192, "learning_rate": 9.19207690548021e-06, "loss": 0.7083, "step": 1031 }, { "epoch": 0.20819109400561708, "grad_norm": 1.4301296472549438, "learning_rate": 9.190294995305598e-06, "loss": 0.708, "step": 1032 }, { "epoch": 0.20839282956182406, "grad_norm": 0.43994635343551636, "learning_rate": 9.188511295365683e-06, "loss": 0.6809, "step": 1033 }, { "epoch": 0.20859456511803107, "grad_norm": 0.9496873021125793, "learning_rate": 9.186725806422325e-06, "loss": 0.6739, "step": 1034 }, { "epoch": 0.20879630067423804, "grad_norm": 0.5276889801025391, "learning_rate": 9.184938529238144e-06, "loss": 0.7, "step": 1035 }, { "epoch": 0.20899803623044505, "grad_norm": 0.45405182242393494, "learning_rate": 9.183149464576524e-06, "loss": 0.687, "step": 1036 }, { "epoch": 0.20919977178665203, "grad_norm": 0.5376430749893188, "learning_rate": 9.181358613201613e-06, "loss": 0.6767, "step": 1037 }, { "epoch": 0.20940150734285903, "grad_norm": 0.6030016541481018, "learning_rate": 9.179565975878324e-06, "loss": 0.8204, "step": 1038 }, { "epoch": 0.20960324289906604, "grad_norm": 0.5121897459030151, "learning_rate": 9.177771553372328e-06, "loss": 0.7511, "step": 1039 }, { "epoch": 0.20980497845527302, "grad_norm": 0.5720887184143066, "learning_rate": 9.175975346450063e-06, "loss": 0.8343, "step": 1040 }, { "epoch": 0.21000671401148002, "grad_norm": 0.7431820034980774, "learning_rate": 9.174177355878731e-06, "loss": 0.7304, "step": 1041 }, { "epoch": 0.210208449567687, "grad_norm": 0.49612024426460266, "learning_rate": 9.172377582426286e-06, "loss": 0.6773, "step": 1042 }, { "epoch": 0.210410185123894, "grad_norm": 0.45197567343711853, "learning_rate": 9.170576026861455e-06, "loss": 0.6998, "step": 1043 }, { "epoch": 0.21061192068010098, "grad_norm": 0.8435342311859131, "learning_rate": 9.16877268995372e-06, "loss": 0.7114, "step": 1044 }, { "epoch": 0.210813656236308, "grad_norm": 0.961377739906311, "learning_rate": 9.166967572473325e-06, "loss": 0.7085, "step": 1045 }, { "epoch": 0.21101539179251497, "grad_norm": 0.6751661896705627, "learning_rate": 9.165160675191272e-06, "loss": 0.8048, "step": 1046 }, { "epoch": 0.21121712734872197, "grad_norm": 0.9755600690841675, "learning_rate": 9.163351998879331e-06, "loss": 0.7424, "step": 1047 }, { "epoch": 0.21141886290492898, "grad_norm": 0.9335680603981018, "learning_rate": 9.161541544310022e-06, "loss": 0.6775, "step": 1048 }, { "epoch": 0.21162059846113596, "grad_norm": 0.6090373396873474, "learning_rate": 9.159729312256632e-06, "loss": 0.6716, "step": 1049 }, { "epoch": 0.21182233401734296, "grad_norm": 0.7214455604553223, "learning_rate": 9.157915303493201e-06, "loss": 0.7575, "step": 1050 }, { "epoch": 0.21202406957354994, "grad_norm": 0.7865515947341919, "learning_rate": 9.156099518794535e-06, "loss": 0.6964, "step": 1051 }, { "epoch": 0.21222580512975694, "grad_norm": 1.348393440246582, "learning_rate": 9.154281958936194e-06, "loss": 0.7956, "step": 1052 }, { "epoch": 0.21242754068596392, "grad_norm": 1.2331461906433105, "learning_rate": 9.152462624694495e-06, "loss": 0.8917, "step": 1053 }, { "epoch": 0.21262927624217093, "grad_norm": 0.6964088678359985, "learning_rate": 9.150641516846517e-06, "loss": 0.7055, "step": 1054 }, { "epoch": 0.21283101179837793, "grad_norm": 0.4619287848472595, "learning_rate": 9.148818636170092e-06, "loss": 0.699, "step": 1055 }, { "epoch": 0.2130327473545849, "grad_norm": 1.3972951173782349, "learning_rate": 9.146993983443815e-06, "loss": 0.681, "step": 1056 }, { "epoch": 0.21323448291079192, "grad_norm": 0.4713950455188751, "learning_rate": 9.145167559447032e-06, "loss": 0.6583, "step": 1057 }, { "epoch": 0.2134362184669989, "grad_norm": 0.5854702591896057, "learning_rate": 9.143339364959849e-06, "loss": 0.8043, "step": 1058 }, { "epoch": 0.2136379540232059, "grad_norm": 0.4331769347190857, "learning_rate": 9.141509400763127e-06, "loss": 0.7089, "step": 1059 }, { "epoch": 0.21383968957941288, "grad_norm": 0.4095942974090576, "learning_rate": 9.139677667638481e-06, "loss": 0.6899, "step": 1060 }, { "epoch": 0.21404142513561988, "grad_norm": 0.7834404706954956, "learning_rate": 9.137844166368289e-06, "loss": 0.6861, "step": 1061 }, { "epoch": 0.21424316069182686, "grad_norm": 4.768156051635742, "learning_rate": 9.136008897735673e-06, "loss": 0.7185, "step": 1062 }, { "epoch": 0.21444489624803387, "grad_norm": 3.4436988830566406, "learning_rate": 9.13417186252452e-06, "loss": 0.7006, "step": 1063 }, { "epoch": 0.21464663180424087, "grad_norm": 0.5718343257904053, "learning_rate": 9.132333061519465e-06, "loss": 0.7174, "step": 1064 }, { "epoch": 0.21484836736044785, "grad_norm": 0.7323482632637024, "learning_rate": 9.130492495505902e-06, "loss": 0.6881, "step": 1065 }, { "epoch": 0.21505010291665486, "grad_norm": 0.48052531480789185, "learning_rate": 9.128650165269973e-06, "loss": 0.7082, "step": 1066 }, { "epoch": 0.21525183847286183, "grad_norm": 0.4008042514324188, "learning_rate": 9.126806071598579e-06, "loss": 0.7354, "step": 1067 }, { "epoch": 0.21545357402906884, "grad_norm": 1.4962018728256226, "learning_rate": 9.124960215279372e-06, "loss": 0.8422, "step": 1068 }, { "epoch": 0.21565530958527582, "grad_norm": 0.5466567277908325, "learning_rate": 9.123112597100759e-06, "loss": 0.7146, "step": 1069 }, { "epoch": 0.21585704514148282, "grad_norm": 0.8619051575660706, "learning_rate": 9.121263217851892e-06, "loss": 0.7536, "step": 1070 }, { "epoch": 0.2160587806976898, "grad_norm": 0.5741105675697327, "learning_rate": 9.119412078322688e-06, "loss": 0.6986, "step": 1071 }, { "epoch": 0.2162605162538968, "grad_norm": 0.5464684963226318, "learning_rate": 9.1175591793038e-06, "loss": 0.7119, "step": 1072 }, { "epoch": 0.2164622518101038, "grad_norm": 0.5449472665786743, "learning_rate": 9.11570452158665e-06, "loss": 0.7213, "step": 1073 }, { "epoch": 0.2166639873663108, "grad_norm": 0.830302357673645, "learning_rate": 9.113848105963397e-06, "loss": 0.7419, "step": 1074 }, { "epoch": 0.2168657229225178, "grad_norm": 0.5640235543251038, "learning_rate": 9.111989933226957e-06, "loss": 0.638, "step": 1075 }, { "epoch": 0.21706745847872477, "grad_norm": 0.990414559841156, "learning_rate": 9.110130004170995e-06, "loss": 0.7019, "step": 1076 }, { "epoch": 0.21726919403493178, "grad_norm": 0.6244648694992065, "learning_rate": 9.108268319589928e-06, "loss": 0.7039, "step": 1077 }, { "epoch": 0.21747092959113876, "grad_norm": 0.7823309302330017, "learning_rate": 9.106404880278923e-06, "loss": 0.766, "step": 1078 }, { "epoch": 0.21767266514734576, "grad_norm": 0.5949699282646179, "learning_rate": 9.104539687033891e-06, "loss": 0.8078, "step": 1079 }, { "epoch": 0.21787440070355274, "grad_norm": 0.4364759922027588, "learning_rate": 9.1026727406515e-06, "loss": 0.7588, "step": 1080 }, { "epoch": 0.21807613625975975, "grad_norm": 1.3297545909881592, "learning_rate": 9.100804041929161e-06, "loss": 0.7108, "step": 1081 }, { "epoch": 0.21827787181596675, "grad_norm": 0.5728005766868591, "learning_rate": 9.098933591665037e-06, "loss": 0.8183, "step": 1082 }, { "epoch": 0.21847960737217373, "grad_norm": 0.559471607208252, "learning_rate": 9.097061390658036e-06, "loss": 0.8388, "step": 1083 }, { "epoch": 0.21868134292838073, "grad_norm": 0.6557266116142273, "learning_rate": 9.095187439707817e-06, "loss": 0.6565, "step": 1084 }, { "epoch": 0.2188830784845877, "grad_norm": 1.3041290044784546, "learning_rate": 9.093311739614783e-06, "loss": 0.677, "step": 1085 }, { "epoch": 0.21908481404079472, "grad_norm": 0.4473462700843811, "learning_rate": 9.091434291180088e-06, "loss": 0.6987, "step": 1086 }, { "epoch": 0.2192865495970017, "grad_norm": 0.5305896401405334, "learning_rate": 9.08955509520563e-06, "loss": 0.7524, "step": 1087 }, { "epoch": 0.2194882851532087, "grad_norm": 0.6427012085914612, "learning_rate": 9.087674152494052e-06, "loss": 0.8808, "step": 1088 }, { "epoch": 0.21969002070941568, "grad_norm": 0.9214508533477783, "learning_rate": 9.085791463848748e-06, "loss": 0.649, "step": 1089 }, { "epoch": 0.21989175626562268, "grad_norm": 1.6634055376052856, "learning_rate": 9.083907030073853e-06, "loss": 0.8284, "step": 1090 }, { "epoch": 0.2200934918218297, "grad_norm": 0.5022058486938477, "learning_rate": 9.08202085197425e-06, "loss": 0.6859, "step": 1091 }, { "epoch": 0.22029522737803667, "grad_norm": 0.5239662528038025, "learning_rate": 9.080132930355567e-06, "loss": 0.6801, "step": 1092 }, { "epoch": 0.22049696293424367, "grad_norm": 0.5457141995429993, "learning_rate": 9.078243266024177e-06, "loss": 0.8468, "step": 1093 }, { "epoch": 0.22069869849045065, "grad_norm": 2.5816259384155273, "learning_rate": 9.076351859787191e-06, "loss": 0.7319, "step": 1094 }, { "epoch": 0.22090043404665766, "grad_norm": 1.3747007846832275, "learning_rate": 9.074458712452476e-06, "loss": 0.6986, "step": 1095 }, { "epoch": 0.22110216960286463, "grad_norm": 1.4433778524398804, "learning_rate": 9.072563824828631e-06, "loss": 0.7989, "step": 1096 }, { "epoch": 0.22130390515907164, "grad_norm": 1.1537508964538574, "learning_rate": 9.070667197725007e-06, "loss": 0.8272, "step": 1097 }, { "epoch": 0.22150564071527862, "grad_norm": 0.5916514992713928, "learning_rate": 9.068768831951693e-06, "loss": 0.758, "step": 1098 }, { "epoch": 0.22170737627148562, "grad_norm": 1.487269639968872, "learning_rate": 9.066868728319522e-06, "loss": 0.8429, "step": 1099 }, { "epoch": 0.22190911182769263, "grad_norm": 1.0207079648971558, "learning_rate": 9.064966887640068e-06, "loss": 0.7288, "step": 1100 }, { "epoch": 0.2221108473838996, "grad_norm": 0.4363601505756378, "learning_rate": 9.06306331072565e-06, "loss": 0.6922, "step": 1101 }, { "epoch": 0.2223125829401066, "grad_norm": 0.7018163800239563, "learning_rate": 9.061157998389325e-06, "loss": 0.6636, "step": 1102 }, { "epoch": 0.2225143184963136, "grad_norm": 0.6163290739059448, "learning_rate": 9.059250951444894e-06, "loss": 0.7651, "step": 1103 }, { "epoch": 0.2227160540525206, "grad_norm": 0.7058590054512024, "learning_rate": 9.057342170706897e-06, "loss": 0.7247, "step": 1104 }, { "epoch": 0.22291778960872757, "grad_norm": 0.9741607904434204, "learning_rate": 9.055431656990617e-06, "loss": 0.6887, "step": 1105 }, { "epoch": 0.22311952516493458, "grad_norm": 0.6746135354042053, "learning_rate": 9.053519411112075e-06, "loss": 0.7096, "step": 1106 }, { "epoch": 0.22332126072114158, "grad_norm": 0.7209685444831848, "learning_rate": 9.051605433888031e-06, "loss": 0.6607, "step": 1107 }, { "epoch": 0.22352299627734856, "grad_norm": 1.3863085508346558, "learning_rate": 9.049689726135988e-06, "loss": 0.7675, "step": 1108 }, { "epoch": 0.22372473183355557, "grad_norm": 0.8169994354248047, "learning_rate": 9.047772288674183e-06, "loss": 0.6838, "step": 1109 }, { "epoch": 0.22392646738976255, "grad_norm": 0.48424163460731506, "learning_rate": 9.045853122321599e-06, "loss": 0.8101, "step": 1110 }, { "epoch": 0.22412820294596955, "grad_norm": 0.3474283814430237, "learning_rate": 9.04393222789795e-06, "loss": 0.6836, "step": 1111 }, { "epoch": 0.22432993850217653, "grad_norm": 0.47879624366760254, "learning_rate": 9.042009606223693e-06, "loss": 0.7023, "step": 1112 }, { "epoch": 0.22453167405838353, "grad_norm": 0.7166565656661987, "learning_rate": 9.040085258120022e-06, "loss": 0.8499, "step": 1113 }, { "epoch": 0.2247334096145905, "grad_norm": 1.2024344205856323, "learning_rate": 9.038159184408863e-06, "loss": 0.745, "step": 1114 }, { "epoch": 0.22493514517079752, "grad_norm": 1.2393633127212524, "learning_rate": 9.03623138591289e-06, "loss": 0.8218, "step": 1115 }, { "epoch": 0.22513688072700452, "grad_norm": 0.4706745147705078, "learning_rate": 9.034301863455504e-06, "loss": 0.7126, "step": 1116 }, { "epoch": 0.2253386162832115, "grad_norm": 1.2676310539245605, "learning_rate": 9.032370617860844e-06, "loss": 0.7014, "step": 1117 }, { "epoch": 0.2255403518394185, "grad_norm": 0.6443676352500916, "learning_rate": 9.03043764995379e-06, "loss": 0.6831, "step": 1118 }, { "epoch": 0.22574208739562548, "grad_norm": 1.2037488222122192, "learning_rate": 9.028502960559952e-06, "loss": 0.6523, "step": 1119 }, { "epoch": 0.2259438229518325, "grad_norm": 0.8409974575042725, "learning_rate": 9.026566550505677e-06, "loss": 0.689, "step": 1120 }, { "epoch": 0.22614555850803947, "grad_norm": 0.48111411929130554, "learning_rate": 9.02462842061805e-06, "loss": 0.6924, "step": 1121 }, { "epoch": 0.22634729406424647, "grad_norm": 0.8593577146530151, "learning_rate": 9.022688571724888e-06, "loss": 0.6815, "step": 1122 }, { "epoch": 0.22654902962045345, "grad_norm": 0.8607560396194458, "learning_rate": 9.02074700465474e-06, "loss": 0.7054, "step": 1123 }, { "epoch": 0.22675076517666046, "grad_norm": 0.4094844460487366, "learning_rate": 9.018803720236891e-06, "loss": 0.7117, "step": 1124 }, { "epoch": 0.22695250073286746, "grad_norm": 0.4774841070175171, "learning_rate": 9.016858719301363e-06, "loss": 0.6442, "step": 1125 }, { "epoch": 0.22715423628907444, "grad_norm": 0.7203394174575806, "learning_rate": 9.014912002678905e-06, "loss": 0.7519, "step": 1126 }, { "epoch": 0.22735597184528145, "grad_norm": 0.4779406189918518, "learning_rate": 9.012963571200998e-06, "loss": 0.7155, "step": 1127 }, { "epoch": 0.22755770740148842, "grad_norm": 1.0806742906570435, "learning_rate": 9.011013425699868e-06, "loss": 0.6541, "step": 1128 }, { "epoch": 0.22775944295769543, "grad_norm": 0.3512164354324341, "learning_rate": 9.00906156700846e-06, "loss": 0.7335, "step": 1129 }, { "epoch": 0.2279611785139024, "grad_norm": 0.7348980903625488, "learning_rate": 9.007107995960452e-06, "loss": 0.6764, "step": 1130 }, { "epoch": 0.2281629140701094, "grad_norm": 0.45355191826820374, "learning_rate": 9.005152713390259e-06, "loss": 0.7835, "step": 1131 }, { "epoch": 0.2283646496263164, "grad_norm": 0.4727195203304291, "learning_rate": 9.003195720133024e-06, "loss": 0.6377, "step": 1132 }, { "epoch": 0.2285663851825234, "grad_norm": 1.2743937969207764, "learning_rate": 9.001237017024621e-06, "loss": 0.6838, "step": 1133 }, { "epoch": 0.2287681207387304, "grad_norm": 0.3683013916015625, "learning_rate": 8.999276604901654e-06, "loss": 0.6876, "step": 1134 }, { "epoch": 0.22896985629493738, "grad_norm": 0.652779757976532, "learning_rate": 8.997314484601458e-06, "loss": 0.6726, "step": 1135 }, { "epoch": 0.22917159185114439, "grad_norm": 1.0614265203475952, "learning_rate": 8.995350656962098e-06, "loss": 0.6525, "step": 1136 }, { "epoch": 0.22937332740735136, "grad_norm": 0.6317399740219116, "learning_rate": 8.993385122822364e-06, "loss": 0.7065, "step": 1137 }, { "epoch": 0.22957506296355837, "grad_norm": 0.37013980746269226, "learning_rate": 8.99141788302178e-06, "loss": 0.6774, "step": 1138 }, { "epoch": 0.22977679851976535, "grad_norm": 0.40387916564941406, "learning_rate": 8.989448938400596e-06, "loss": 0.6923, "step": 1139 }, { "epoch": 0.22997853407597235, "grad_norm": 0.4020114839076996, "learning_rate": 8.987478289799792e-06, "loss": 0.6752, "step": 1140 }, { "epoch": 0.23018026963217933, "grad_norm": 0.7502828240394592, "learning_rate": 8.98550593806107e-06, "loss": 0.6412, "step": 1141 }, { "epoch": 0.23038200518838634, "grad_norm": 0.4360479414463043, "learning_rate": 8.98353188402687e-06, "loss": 0.6961, "step": 1142 }, { "epoch": 0.23058374074459334, "grad_norm": 0.7076632976531982, "learning_rate": 8.98155612854035e-06, "loss": 0.7323, "step": 1143 }, { "epoch": 0.23078547630080032, "grad_norm": 0.4245670735836029, "learning_rate": 8.979578672445397e-06, "loss": 0.6964, "step": 1144 }, { "epoch": 0.23098721185700732, "grad_norm": 0.4887491464614868, "learning_rate": 8.977599516586625e-06, "loss": 0.8195, "step": 1145 }, { "epoch": 0.2311889474132143, "grad_norm": 0.816503643989563, "learning_rate": 8.975618661809378e-06, "loss": 0.7633, "step": 1146 }, { "epoch": 0.2313906829694213, "grad_norm": 0.37157323956489563, "learning_rate": 8.973636108959718e-06, "loss": 0.7864, "step": 1147 }, { "epoch": 0.23159241852562829, "grad_norm": 2.1369481086730957, "learning_rate": 8.971651858884436e-06, "loss": 0.7709, "step": 1148 }, { "epoch": 0.2317941540818353, "grad_norm": 0.6506975889205933, "learning_rate": 8.969665912431049e-06, "loss": 0.9078, "step": 1149 }, { "epoch": 0.2319958896380423, "grad_norm": 0.3176805377006531, "learning_rate": 8.9676782704478e-06, "loss": 0.6704, "step": 1150 }, { "epoch": 0.23219762519424927, "grad_norm": 0.45958906412124634, "learning_rate": 8.965688933783648e-06, "loss": 0.8634, "step": 1151 }, { "epoch": 0.23239936075045628, "grad_norm": 0.4763012230396271, "learning_rate": 8.963697903288287e-06, "loss": 0.7168, "step": 1152 }, { "epoch": 0.23260109630666326, "grad_norm": 0.3679403066635132, "learning_rate": 8.961705179812126e-06, "loss": 0.6559, "step": 1153 }, { "epoch": 0.23280283186287026, "grad_norm": 1.3070601224899292, "learning_rate": 8.9597107642063e-06, "loss": 0.7322, "step": 1154 }, { "epoch": 0.23300456741907724, "grad_norm": 1.5573060512542725, "learning_rate": 8.957714657322669e-06, "loss": 0.8178, "step": 1155 }, { "epoch": 0.23320630297528425, "grad_norm": 1.0047229528427124, "learning_rate": 8.955716860013812e-06, "loss": 0.9026, "step": 1156 }, { "epoch": 0.23340803853149122, "grad_norm": 0.44546905159950256, "learning_rate": 8.953717373133031e-06, "loss": 0.6928, "step": 1157 }, { "epoch": 0.23360977408769823, "grad_norm": 0.6220713257789612, "learning_rate": 8.95171619753435e-06, "loss": 0.6822, "step": 1158 }, { "epoch": 0.23381150964390524, "grad_norm": 0.8338699340820312, "learning_rate": 8.949713334072516e-06, "loss": 0.7441, "step": 1159 }, { "epoch": 0.2340132452001122, "grad_norm": 0.8864205479621887, "learning_rate": 8.947708783602993e-06, "loss": 0.676, "step": 1160 }, { "epoch": 0.23421498075631922, "grad_norm": 0.6396302580833435, "learning_rate": 8.94570254698197e-06, "loss": 0.7031, "step": 1161 }, { "epoch": 0.2344167163125262, "grad_norm": 0.4238170385360718, "learning_rate": 8.94369462506635e-06, "loss": 0.6581, "step": 1162 }, { "epoch": 0.2346184518687332, "grad_norm": 0.36502161622047424, "learning_rate": 8.941685018713762e-06, "loss": 0.7065, "step": 1163 }, { "epoch": 0.23482018742494018, "grad_norm": 0.5333518385887146, "learning_rate": 8.939673728782555e-06, "loss": 0.7189, "step": 1164 }, { "epoch": 0.23502192298114719, "grad_norm": 0.4384578466415405, "learning_rate": 8.937660756131789e-06, "loss": 0.8163, "step": 1165 }, { "epoch": 0.23522365853735416, "grad_norm": 0.4460729658603668, "learning_rate": 8.935646101621252e-06, "loss": 0.6813, "step": 1166 }, { "epoch": 0.23542539409356117, "grad_norm": 0.53566974401474, "learning_rate": 8.933629766111443e-06, "loss": 0.6833, "step": 1167 }, { "epoch": 0.23562712964976817, "grad_norm": 0.4094941020011902, "learning_rate": 8.931611750463586e-06, "loss": 0.6722, "step": 1168 }, { "epoch": 0.23582886520597515, "grad_norm": 0.462923139333725, "learning_rate": 8.929592055539615e-06, "loss": 0.8593, "step": 1169 }, { "epoch": 0.23603060076218216, "grad_norm": 0.5280745029449463, "learning_rate": 8.92757068220219e-06, "loss": 0.6685, "step": 1170 }, { "epoch": 0.23623233631838914, "grad_norm": 0.5644258856773376, "learning_rate": 8.925547631314679e-06, "loss": 0.6895, "step": 1171 }, { "epoch": 0.23643407187459614, "grad_norm": 0.4548441469669342, "learning_rate": 8.923522903741173e-06, "loss": 0.6942, "step": 1172 }, { "epoch": 0.23663580743080312, "grad_norm": 0.48022985458374023, "learning_rate": 8.921496500346477e-06, "loss": 0.6608, "step": 1173 }, { "epoch": 0.23683754298701012, "grad_norm": 0.5295478105545044, "learning_rate": 8.91946842199611e-06, "loss": 0.6691, "step": 1174 }, { "epoch": 0.2370392785432171, "grad_norm": 0.4129369854927063, "learning_rate": 8.917438669556307e-06, "loss": 0.6763, "step": 1175 }, { "epoch": 0.2372410140994241, "grad_norm": 0.4060460329055786, "learning_rate": 8.915407243894022e-06, "loss": 0.725, "step": 1176 }, { "epoch": 0.2374427496556311, "grad_norm": 0.8107428550720215, "learning_rate": 8.913374145876918e-06, "loss": 0.8017, "step": 1177 }, { "epoch": 0.2376444852118381, "grad_norm": 0.35032761096954346, "learning_rate": 8.911339376373377e-06, "loss": 0.7125, "step": 1178 }, { "epoch": 0.2378462207680451, "grad_norm": 1.233778476715088, "learning_rate": 8.909302936252491e-06, "loss": 0.7081, "step": 1179 }, { "epoch": 0.23804795632425207, "grad_norm": 0.5814195871353149, "learning_rate": 8.90726482638407e-06, "loss": 0.6855, "step": 1180 }, { "epoch": 0.23824969188045908, "grad_norm": 0.5171557664871216, "learning_rate": 8.905225047638633e-06, "loss": 0.6714, "step": 1181 }, { "epoch": 0.23845142743666606, "grad_norm": 1.1142185926437378, "learning_rate": 8.903183600887412e-06, "loss": 1.0363, "step": 1182 }, { "epoch": 0.23865316299287306, "grad_norm": 2.59897518157959, "learning_rate": 8.901140487002358e-06, "loss": 0.795, "step": 1183 }, { "epoch": 0.23885489854908004, "grad_norm": 3.8224384784698486, "learning_rate": 8.899095706856122e-06, "loss": 0.7541, "step": 1184 }, { "epoch": 0.23905663410528705, "grad_norm": 0.720397412776947, "learning_rate": 8.897049261322079e-06, "loss": 0.6947, "step": 1185 }, { "epoch": 0.23925836966149405, "grad_norm": 3.830878734588623, "learning_rate": 8.895001151274309e-06, "loss": 0.7975, "step": 1186 }, { "epoch": 0.23946010521770103, "grad_norm": 0.5359489321708679, "learning_rate": 8.892951377587602e-06, "loss": 0.6846, "step": 1187 }, { "epoch": 0.23966184077390804, "grad_norm": 0.6558648943901062, "learning_rate": 8.890899941137461e-06, "loss": 0.6896, "step": 1188 }, { "epoch": 0.239863576330115, "grad_norm": 0.8913664221763611, "learning_rate": 8.888846842800101e-06, "loss": 0.7391, "step": 1189 }, { "epoch": 0.24006531188632202, "grad_norm": 0.5871713757514954, "learning_rate": 8.886792083452443e-06, "loss": 0.8685, "step": 1190 }, { "epoch": 0.240267047442529, "grad_norm": 0.4375171959400177, "learning_rate": 8.884735663972118e-06, "loss": 0.7846, "step": 1191 }, { "epoch": 0.240468782998736, "grad_norm": 1.61861252784729, "learning_rate": 8.882677585237467e-06, "loss": 0.6952, "step": 1192 }, { "epoch": 0.24067051855494298, "grad_norm": 0.9504277110099792, "learning_rate": 8.880617848127542e-06, "loss": 0.6648, "step": 1193 }, { "epoch": 0.24087225411114999, "grad_norm": 0.5535653233528137, "learning_rate": 8.8785564535221e-06, "loss": 0.7864, "step": 1194 }, { "epoch": 0.241073989667357, "grad_norm": 0.38245075941085815, "learning_rate": 8.876493402301606e-06, "loss": 0.6732, "step": 1195 }, { "epoch": 0.24127572522356397, "grad_norm": 0.43615174293518066, "learning_rate": 8.874428695347237e-06, "loss": 0.7012, "step": 1196 }, { "epoch": 0.24147746077977097, "grad_norm": 0.7267807722091675, "learning_rate": 8.872362333540869e-06, "loss": 0.6345, "step": 1197 }, { "epoch": 0.24167919633597795, "grad_norm": 0.5719030499458313, "learning_rate": 8.870294317765094e-06, "loss": 0.7034, "step": 1198 }, { "epoch": 0.24188093189218496, "grad_norm": 0.8841007351875305, "learning_rate": 8.868224648903203e-06, "loss": 0.707, "step": 1199 }, { "epoch": 0.24208266744839194, "grad_norm": 0.48409315943717957, "learning_rate": 8.866153327839198e-06, "loss": 0.7201, "step": 1200 }, { "epoch": 0.24228440300459894, "grad_norm": 0.7220364212989807, "learning_rate": 8.864080355457782e-06, "loss": 0.7586, "step": 1201 }, { "epoch": 0.24248613856080595, "grad_norm": 0.43595415353775024, "learning_rate": 8.862005732644373e-06, "loss": 0.6996, "step": 1202 }, { "epoch": 0.24268787411701293, "grad_norm": 0.4253355860710144, "learning_rate": 8.859929460285078e-06, "loss": 0.8073, "step": 1203 }, { "epoch": 0.24288960967321993, "grad_norm": 0.6659312844276428, "learning_rate": 8.857851539266724e-06, "loss": 0.6698, "step": 1204 }, { "epoch": 0.2430913452294269, "grad_norm": 0.3482789695262909, "learning_rate": 8.855771970476834e-06, "loss": 0.6306, "step": 1205 }, { "epoch": 0.24329308078563391, "grad_norm": 6.6193718910217285, "learning_rate": 8.853690754803638e-06, "loss": 0.6382, "step": 1206 }, { "epoch": 0.2434948163418409, "grad_norm": 0.38211026787757874, "learning_rate": 8.851607893136065e-06, "loss": 0.693, "step": 1207 }, { "epoch": 0.2436965518980479, "grad_norm": 0.7665725946426392, "learning_rate": 8.849523386363754e-06, "loss": 0.6995, "step": 1208 }, { "epoch": 0.24389828745425488, "grad_norm": 0.5308759212493896, "learning_rate": 8.84743723537704e-06, "loss": 0.7059, "step": 1209 }, { "epoch": 0.24410002301046188, "grad_norm": 0.8600336313247681, "learning_rate": 8.845349441066961e-06, "loss": 0.7524, "step": 1210 }, { "epoch": 0.2443017585666689, "grad_norm": 0.656417191028595, "learning_rate": 8.843260004325265e-06, "loss": 0.7514, "step": 1211 }, { "epoch": 0.24450349412287586, "grad_norm": 0.33181649446487427, "learning_rate": 8.84116892604439e-06, "loss": 0.6944, "step": 1212 }, { "epoch": 0.24470522967908287, "grad_norm": 1.0347340106964111, "learning_rate": 8.839076207117485e-06, "loss": 0.7253, "step": 1213 }, { "epoch": 0.24490696523528985, "grad_norm": 1.1775801181793213, "learning_rate": 8.83698184843839e-06, "loss": 0.6874, "step": 1214 }, { "epoch": 0.24510870079149685, "grad_norm": 0.4488203823566437, "learning_rate": 8.834885850901656e-06, "loss": 1.1538, "step": 1215 }, { "epoch": 0.24531043634770383, "grad_norm": 0.508878231048584, "learning_rate": 8.832788215402527e-06, "loss": 0.8184, "step": 1216 }, { "epoch": 0.24551217190391084, "grad_norm": 0.42795446515083313, "learning_rate": 8.830688942836946e-06, "loss": 0.6962, "step": 1217 }, { "epoch": 0.24571390746011781, "grad_norm": 0.7489685416221619, "learning_rate": 8.828588034101561e-06, "loss": 0.6939, "step": 1218 }, { "epoch": 0.24591564301632482, "grad_norm": 0.5963667035102844, "learning_rate": 8.826485490093714e-06, "loss": 0.6844, "step": 1219 }, { "epoch": 0.24611737857253183, "grad_norm": 0.5979509353637695, "learning_rate": 8.824381311711444e-06, "loss": 0.6856, "step": 1220 }, { "epoch": 0.2463191141287388, "grad_norm": 0.4428512454032898, "learning_rate": 8.822275499853497e-06, "loss": 0.6599, "step": 1221 }, { "epoch": 0.2465208496849458, "grad_norm": 0.47854629158973694, "learning_rate": 8.820168055419306e-06, "loss": 0.6481, "step": 1222 }, { "epoch": 0.2467225852411528, "grad_norm": 0.5568980574607849, "learning_rate": 8.818058979309007e-06, "loss": 0.6722, "step": 1223 }, { "epoch": 0.2469243207973598, "grad_norm": 0.5346736311912537, "learning_rate": 8.815948272423432e-06, "loss": 0.7013, "step": 1224 }, { "epoch": 0.24712605635356677, "grad_norm": 0.515552818775177, "learning_rate": 8.81383593566411e-06, "loss": 0.8875, "step": 1225 }, { "epoch": 0.24732779190977378, "grad_norm": 0.581317126750946, "learning_rate": 8.811721969933264e-06, "loss": 0.7057, "step": 1226 }, { "epoch": 0.24752952746598075, "grad_norm": 0.5679516792297363, "learning_rate": 8.809606376133814e-06, "loss": 0.701, "step": 1227 }, { "epoch": 0.24773126302218776, "grad_norm": 0.623017430305481, "learning_rate": 8.80748915516938e-06, "loss": 0.7434, "step": 1228 }, { "epoch": 0.24793299857839476, "grad_norm": 0.7623668313026428, "learning_rate": 8.805370307944268e-06, "loss": 0.6615, "step": 1229 }, { "epoch": 0.24813473413460174, "grad_norm": 0.4862426817417145, "learning_rate": 8.803249835363486e-06, "loss": 0.6661, "step": 1230 }, { "epoch": 0.24833646969080875, "grad_norm": 0.9921598434448242, "learning_rate": 8.801127738332731e-06, "loss": 0.7617, "step": 1231 }, { "epoch": 0.24853820524701573, "grad_norm": 0.4795064926147461, "learning_rate": 8.7990040177584e-06, "loss": 0.8968, "step": 1232 }, { "epoch": 0.24873994080322273, "grad_norm": 2.262944221496582, "learning_rate": 8.796878674547578e-06, "loss": 0.6576, "step": 1233 }, { "epoch": 0.2489416763594297, "grad_norm": 1.5917365550994873, "learning_rate": 8.794751709608042e-06, "loss": 0.8318, "step": 1234 }, { "epoch": 0.24914341191563671, "grad_norm": 0.45491623878479004, "learning_rate": 8.79262312384827e-06, "loss": 0.6919, "step": 1235 }, { "epoch": 0.2493451474718437, "grad_norm": 0.7376951575279236, "learning_rate": 8.790492918177423e-06, "loss": 0.7886, "step": 1236 }, { "epoch": 0.2495468830280507, "grad_norm": 0.6271589994430542, "learning_rate": 8.788361093505358e-06, "loss": 0.6272, "step": 1237 }, { "epoch": 0.2497486185842577, "grad_norm": 0.5672959685325623, "learning_rate": 8.786227650742624e-06, "loss": 0.6987, "step": 1238 }, { "epoch": 0.24995035414046468, "grad_norm": 0.41392165422439575, "learning_rate": 8.784092590800462e-06, "loss": 0.6873, "step": 1239 }, { "epoch": 0.2501520896966717, "grad_norm": 0.5560492873191833, "learning_rate": 8.781955914590801e-06, "loss": 0.6692, "step": 1240 }, { "epoch": 0.2503538252528787, "grad_norm": 1.9556437730789185, "learning_rate": 8.77981762302626e-06, "loss": 0.7373, "step": 1241 }, { "epoch": 0.25055556080908564, "grad_norm": 0.4091489613056183, "learning_rate": 8.77767771702015e-06, "loss": 0.6626, "step": 1242 }, { "epoch": 0.25075729636529265, "grad_norm": 0.693080723285675, "learning_rate": 8.775536197486471e-06, "loss": 0.6866, "step": 1243 }, { "epoch": 0.25095903192149965, "grad_norm": 0.6191006898880005, "learning_rate": 8.773393065339915e-06, "loss": 0.6609, "step": 1244 }, { "epoch": 0.25116076747770666, "grad_norm": 0.4202521741390228, "learning_rate": 8.771248321495856e-06, "loss": 0.7482, "step": 1245 }, { "epoch": 0.25136250303391366, "grad_norm": 0.4725857377052307, "learning_rate": 8.769101966870362e-06, "loss": 0.8326, "step": 1246 }, { "epoch": 0.2515642385901206, "grad_norm": 0.6367688775062561, "learning_rate": 8.766954002380188e-06, "loss": 0.6573, "step": 1247 }, { "epoch": 0.2517659741463276, "grad_norm": 0.3952287435531616, "learning_rate": 8.764804428942774e-06, "loss": 0.6654, "step": 1248 }, { "epoch": 0.2519677097025346, "grad_norm": 0.9058800935745239, "learning_rate": 8.762653247476249e-06, "loss": 0.6637, "step": 1249 }, { "epoch": 0.25216944525874163, "grad_norm": 0.42118096351623535, "learning_rate": 8.760500458899432e-06, "loss": 0.6979, "step": 1250 }, { "epoch": 0.2523711808149486, "grad_norm": 0.3834727704524994, "learning_rate": 8.758346064131824e-06, "loss": 0.6721, "step": 1251 }, { "epoch": 0.2525729163711556, "grad_norm": 0.7268792390823364, "learning_rate": 8.756190064093613e-06, "loss": 0.6441, "step": 1252 }, { "epoch": 0.2527746519273626, "grad_norm": 0.3844261169433594, "learning_rate": 8.754032459705672e-06, "loss": 0.677, "step": 1253 }, { "epoch": 0.2529763874835696, "grad_norm": 1.0050066709518433, "learning_rate": 8.751873251889563e-06, "loss": 0.6986, "step": 1254 }, { "epoch": 0.2531781230397766, "grad_norm": 0.4152543544769287, "learning_rate": 8.749712441567526e-06, "loss": 0.7067, "step": 1255 }, { "epoch": 0.25337985859598355, "grad_norm": 0.9145545363426208, "learning_rate": 8.747550029662493e-06, "loss": 0.6727, "step": 1256 }, { "epoch": 0.25358159415219056, "grad_norm": 0.5626360177993774, "learning_rate": 8.74538601709808e-06, "loss": 0.7, "step": 1257 }, { "epoch": 0.25378332970839756, "grad_norm": 0.9599636197090149, "learning_rate": 8.743220404798573e-06, "loss": 0.731, "step": 1258 }, { "epoch": 0.25398506526460457, "grad_norm": 0.6157480478286743, "learning_rate": 8.741053193688964e-06, "loss": 0.6644, "step": 1259 }, { "epoch": 0.2541868008208115, "grad_norm": 0.6961658596992493, "learning_rate": 8.738884384694905e-06, "loss": 0.6765, "step": 1260 }, { "epoch": 0.2543885363770185, "grad_norm": 1.0447839498519897, "learning_rate": 8.73671397874275e-06, "loss": 0.7055, "step": 1261 }, { "epoch": 0.25459027193322553, "grad_norm": 0.5891323685646057, "learning_rate": 8.734541976759519e-06, "loss": 0.7109, "step": 1262 }, { "epoch": 0.25479200748943254, "grad_norm": 1.4431084394454956, "learning_rate": 8.732368379672924e-06, "loss": 0.844, "step": 1263 }, { "epoch": 0.25499374304563954, "grad_norm": 0.45965614914894104, "learning_rate": 8.730193188411355e-06, "loss": 0.6844, "step": 1264 }, { "epoch": 0.2551954786018465, "grad_norm": 0.7861972451210022, "learning_rate": 8.728016403903884e-06, "loss": 0.6699, "step": 1265 }, { "epoch": 0.2553972141580535, "grad_norm": 1.466948390007019, "learning_rate": 8.725838027080261e-06, "loss": 0.7904, "step": 1266 }, { "epoch": 0.2555989497142605, "grad_norm": 0.543451726436615, "learning_rate": 8.723658058870919e-06, "loss": 0.8914, "step": 1267 }, { "epoch": 0.2558006852704675, "grad_norm": 0.5557923913002014, "learning_rate": 8.721476500206968e-06, "loss": 0.6768, "step": 1268 }, { "epoch": 0.25600242082667446, "grad_norm": 0.37838971614837646, "learning_rate": 8.7192933520202e-06, "loss": 0.6898, "step": 1269 }, { "epoch": 0.25620415638288146, "grad_norm": 0.6620925664901733, "learning_rate": 8.717108615243081e-06, "loss": 0.6722, "step": 1270 }, { "epoch": 0.25640589193908847, "grad_norm": 0.35290196537971497, "learning_rate": 8.714922290808766e-06, "loss": 0.6942, "step": 1271 }, { "epoch": 0.2566076274952955, "grad_norm": 0.644790768623352, "learning_rate": 8.712734379651075e-06, "loss": 0.7593, "step": 1272 }, { "epoch": 0.2568093630515025, "grad_norm": 0.7884232401847839, "learning_rate": 8.710544882704516e-06, "loss": 0.7974, "step": 1273 }, { "epoch": 0.25701109860770943, "grad_norm": 0.6848911046981812, "learning_rate": 8.708353800904269e-06, "loss": 0.7131, "step": 1274 }, { "epoch": 0.25721283416391644, "grad_norm": 0.5041124820709229, "learning_rate": 8.706161135186192e-06, "loss": 0.6846, "step": 1275 }, { "epoch": 0.25741456972012344, "grad_norm": 1.2871185541152954, "learning_rate": 8.703966886486819e-06, "loss": 0.6742, "step": 1276 }, { "epoch": 0.25761630527633045, "grad_norm": 0.5651084184646606, "learning_rate": 8.701771055743363e-06, "loss": 0.7076, "step": 1277 }, { "epoch": 0.2578180408325374, "grad_norm": 0.4107651710510254, "learning_rate": 8.699573643893708e-06, "loss": 0.6315, "step": 1278 }, { "epoch": 0.2580197763887444, "grad_norm": 0.48510411381721497, "learning_rate": 8.697374651876419e-06, "loss": 0.74, "step": 1279 }, { "epoch": 0.2582215119449514, "grad_norm": 0.36969929933547974, "learning_rate": 8.695174080630728e-06, "loss": 1.1805, "step": 1280 }, { "epoch": 0.2584232475011584, "grad_norm": 0.48439717292785645, "learning_rate": 8.692971931096553e-06, "loss": 0.7533, "step": 1281 }, { "epoch": 0.2586249830573654, "grad_norm": 0.5932062864303589, "learning_rate": 8.690768204214474e-06, "loss": 0.7201, "step": 1282 }, { "epoch": 0.25882671861357237, "grad_norm": 1.3188583850860596, "learning_rate": 8.688562900925755e-06, "loss": 0.6881, "step": 1283 }, { "epoch": 0.2590284541697794, "grad_norm": 0.40501800179481506, "learning_rate": 8.686356022172324e-06, "loss": 0.6485, "step": 1284 }, { "epoch": 0.2592301897259864, "grad_norm": 0.7536221146583557, "learning_rate": 8.684147568896788e-06, "loss": 0.6903, "step": 1285 }, { "epoch": 0.2594319252821934, "grad_norm": 0.5470438599586487, "learning_rate": 8.681937542042426e-06, "loss": 0.7256, "step": 1286 }, { "epoch": 0.25963366083840034, "grad_norm": 0.5705691576004028, "learning_rate": 8.679725942553189e-06, "loss": 0.6838, "step": 1287 }, { "epoch": 0.25983539639460734, "grad_norm": 0.7795629501342773, "learning_rate": 8.677512771373695e-06, "loss": 0.7627, "step": 1288 }, { "epoch": 0.26003713195081435, "grad_norm": 1.288813829421997, "learning_rate": 8.675298029449241e-06, "loss": 0.6883, "step": 1289 }, { "epoch": 0.26023886750702135, "grad_norm": 0.40749382972717285, "learning_rate": 8.67308171772579e-06, "loss": 0.7227, "step": 1290 }, { "epoch": 0.26044060306322836, "grad_norm": 1.280716061592102, "learning_rate": 8.670863837149976e-06, "loss": 0.8096, "step": 1291 }, { "epoch": 0.2606423386194353, "grad_norm": 0.424046128988266, "learning_rate": 8.668644388669102e-06, "loss": 0.7505, "step": 1292 }, { "epoch": 0.2608440741756423, "grad_norm": 0.8731921911239624, "learning_rate": 8.666423373231145e-06, "loss": 0.7711, "step": 1293 }, { "epoch": 0.2610458097318493, "grad_norm": 0.40730440616607666, "learning_rate": 8.664200791784746e-06, "loss": 0.7384, "step": 1294 }, { "epoch": 0.2612475452880563, "grad_norm": 0.45330461859703064, "learning_rate": 8.66197664527922e-06, "loss": 0.8328, "step": 1295 }, { "epoch": 0.2614492808442633, "grad_norm": 0.6191751956939697, "learning_rate": 8.659750934664546e-06, "loss": 0.7547, "step": 1296 }, { "epoch": 0.2616510164004703, "grad_norm": 0.5399580597877502, "learning_rate": 8.657523660891376e-06, "loss": 0.9473, "step": 1297 }, { "epoch": 0.2618527519566773, "grad_norm": 0.5975515246391296, "learning_rate": 8.655294824911022e-06, "loss": 0.6853, "step": 1298 }, { "epoch": 0.2620544875128843, "grad_norm": 0.5757783651351929, "learning_rate": 8.65306442767547e-06, "loss": 0.6934, "step": 1299 }, { "epoch": 0.2622562230690913, "grad_norm": 0.5642322301864624, "learning_rate": 8.650832470137373e-06, "loss": 0.6925, "step": 1300 }, { "epoch": 0.26245795862529825, "grad_norm": 0.3975115716457367, "learning_rate": 8.648598953250045e-06, "loss": 0.9906, "step": 1301 }, { "epoch": 0.26265969418150525, "grad_norm": 0.4235062599182129, "learning_rate": 8.64636387796747e-06, "loss": 0.8266, "step": 1302 }, { "epoch": 0.26286142973771226, "grad_norm": 0.4979153275489807, "learning_rate": 8.644127245244298e-06, "loss": 0.707, "step": 1303 }, { "epoch": 0.26306316529391927, "grad_norm": 0.8910357356071472, "learning_rate": 8.641889056035842e-06, "loss": 1.0055, "step": 1304 }, { "epoch": 0.2632649008501262, "grad_norm": 0.6762681603431702, "learning_rate": 8.639649311298081e-06, "loss": 0.948, "step": 1305 }, { "epoch": 0.2634666364063332, "grad_norm": 0.5388845801353455, "learning_rate": 8.637408011987657e-06, "loss": 0.6458, "step": 1306 }, { "epoch": 0.2636683719625402, "grad_norm": 0.4577954113483429, "learning_rate": 8.63516515906188e-06, "loss": 0.9071, "step": 1307 }, { "epoch": 0.26387010751874723, "grad_norm": 0.4139573276042938, "learning_rate": 8.63292075347872e-06, "loss": 0.6905, "step": 1308 }, { "epoch": 0.26407184307495424, "grad_norm": 1.0751546621322632, "learning_rate": 8.630674796196809e-06, "loss": 0.683, "step": 1309 }, { "epoch": 0.2642735786311612, "grad_norm": 2.5541248321533203, "learning_rate": 8.628427288175444e-06, "loss": 0.7364, "step": 1310 }, { "epoch": 0.2644753141873682, "grad_norm": 0.41766172647476196, "learning_rate": 8.626178230374588e-06, "loss": 0.7022, "step": 1311 }, { "epoch": 0.2646770497435752, "grad_norm": 0.4580974578857422, "learning_rate": 8.623927623754858e-06, "loss": 0.7594, "step": 1312 }, { "epoch": 0.2648787852997822, "grad_norm": 1.566453456878662, "learning_rate": 8.621675469277538e-06, "loss": 0.7054, "step": 1313 }, { "epoch": 0.26508052085598915, "grad_norm": 1.0086655616760254, "learning_rate": 8.619421767904571e-06, "loss": 0.6916, "step": 1314 }, { "epoch": 0.26528225641219616, "grad_norm": 0.5008938312530518, "learning_rate": 8.617166520598563e-06, "loss": 0.7232, "step": 1315 }, { "epoch": 0.26548399196840317, "grad_norm": 0.32938429713249207, "learning_rate": 8.614909728322778e-06, "loss": 0.8034, "step": 1316 }, { "epoch": 0.26568572752461017, "grad_norm": 0.5591095089912415, "learning_rate": 8.612651392041138e-06, "loss": 0.8449, "step": 1317 }, { "epoch": 0.2658874630808172, "grad_norm": 1.0034416913986206, "learning_rate": 8.610391512718232e-06, "loss": 0.6897, "step": 1318 }, { "epoch": 0.2660891986370241, "grad_norm": 0.5176849365234375, "learning_rate": 8.6081300913193e-06, "loss": 0.6888, "step": 1319 }, { "epoch": 0.26629093419323113, "grad_norm": 0.4472174346446991, "learning_rate": 8.605867128810243e-06, "loss": 0.9786, "step": 1320 }, { "epoch": 0.26649266974943814, "grad_norm": 0.39083603024482727, "learning_rate": 8.603602626157624e-06, "loss": 0.8769, "step": 1321 }, { "epoch": 0.26669440530564514, "grad_norm": 0.40016594529151917, "learning_rate": 8.601336584328659e-06, "loss": 0.8001, "step": 1322 }, { "epoch": 0.26689614086185215, "grad_norm": 0.42085206508636475, "learning_rate": 8.599069004291224e-06, "loss": 0.6967, "step": 1323 }, { "epoch": 0.2670978764180591, "grad_norm": 0.5440914630889893, "learning_rate": 8.596799887013852e-06, "loss": 1.0482, "step": 1324 }, { "epoch": 0.2672996119742661, "grad_norm": 0.900872528553009, "learning_rate": 8.594529233465728e-06, "loss": 0.7054, "step": 1325 }, { "epoch": 0.2675013475304731, "grad_norm": 0.3810652494430542, "learning_rate": 8.592257044616701e-06, "loss": 0.716, "step": 1326 }, { "epoch": 0.2677030830866801, "grad_norm": 0.5799583196640015, "learning_rate": 8.589983321437271e-06, "loss": 0.7983, "step": 1327 }, { "epoch": 0.26790481864288707, "grad_norm": 0.5827012658119202, "learning_rate": 8.587708064898595e-06, "loss": 0.709, "step": 1328 }, { "epoch": 0.26810655419909407, "grad_norm": 1.0978662967681885, "learning_rate": 8.585431275972483e-06, "loss": 0.6835, "step": 1329 }, { "epoch": 0.2683082897553011, "grad_norm": 0.4511360824108124, "learning_rate": 8.5831529556314e-06, "loss": 0.7022, "step": 1330 }, { "epoch": 0.2685100253115081, "grad_norm": 0.6490656137466431, "learning_rate": 8.580873104848466e-06, "loss": 0.7947, "step": 1331 }, { "epoch": 0.2687117608677151, "grad_norm": 0.6834023594856262, "learning_rate": 8.578591724597455e-06, "loss": 0.7441, "step": 1332 }, { "epoch": 0.26891349642392204, "grad_norm": 0.40996038913726807, "learning_rate": 8.576308815852793e-06, "loss": 0.669, "step": 1333 }, { "epoch": 0.26911523198012904, "grad_norm": 2.8881947994232178, "learning_rate": 8.57402437958956e-06, "loss": 0.7299, "step": 1334 }, { "epoch": 0.26931696753633605, "grad_norm": 2.5765886306762695, "learning_rate": 8.57173841678349e-06, "loss": 0.6747, "step": 1335 }, { "epoch": 0.26951870309254305, "grad_norm": 0.8167450428009033, "learning_rate": 8.569450928410963e-06, "loss": 0.7206, "step": 1336 }, { "epoch": 0.26972043864875, "grad_norm": 2.003079652786255, "learning_rate": 8.567161915449018e-06, "loss": 0.6806, "step": 1337 }, { "epoch": 0.269922174204957, "grad_norm": 2.5011754035949707, "learning_rate": 8.56487137887534e-06, "loss": 0.6929, "step": 1338 }, { "epoch": 0.270123909761164, "grad_norm": 0.7322660088539124, "learning_rate": 8.562579319668265e-06, "loss": 0.7239, "step": 1339 }, { "epoch": 0.270325645317371, "grad_norm": 0.47642382979393005, "learning_rate": 8.560285738806784e-06, "loss": 0.6749, "step": 1340 }, { "epoch": 0.270527380873578, "grad_norm": 0.5918588638305664, "learning_rate": 8.557990637270533e-06, "loss": 0.6878, "step": 1341 }, { "epoch": 0.270729116429785, "grad_norm": 0.4040104150772095, "learning_rate": 8.5556940160398e-06, "loss": 0.7198, "step": 1342 }, { "epoch": 0.270930851985992, "grad_norm": 0.516697883605957, "learning_rate": 8.553395876095523e-06, "loss": 0.769, "step": 1343 }, { "epoch": 0.271132587542199, "grad_norm": 1.1435527801513672, "learning_rate": 8.551096218419283e-06, "loss": 0.6739, "step": 1344 }, { "epoch": 0.271334323098406, "grad_norm": 0.4685196876525879, "learning_rate": 8.548795043993316e-06, "loss": 0.8076, "step": 1345 }, { "epoch": 0.27153605865461294, "grad_norm": 0.4881542921066284, "learning_rate": 8.546492353800504e-06, "loss": 0.659, "step": 1346 }, { "epoch": 0.27173779421081995, "grad_norm": 0.31347280740737915, "learning_rate": 8.544188148824376e-06, "loss": 1.4194, "step": 1347 }, { "epoch": 0.27193952976702696, "grad_norm": 0.5140544772148132, "learning_rate": 8.541882430049103e-06, "loss": 0.6736, "step": 1348 }, { "epoch": 0.27214126532323396, "grad_norm": 0.35953038930892944, "learning_rate": 8.539575198459512e-06, "loss": 0.6883, "step": 1349 }, { "epoch": 0.27234300087944097, "grad_norm": 0.5027438402175903, "learning_rate": 8.537266455041069e-06, "loss": 0.8255, "step": 1350 }, { "epoch": 0.2725447364356479, "grad_norm": 0.4743795096874237, "learning_rate": 8.534956200779889e-06, "loss": 0.7119, "step": 1351 }, { "epoch": 0.2727464719918549, "grad_norm": 0.5884958505630493, "learning_rate": 8.532644436662732e-06, "loss": 0.8181, "step": 1352 }, { "epoch": 0.2729482075480619, "grad_norm": 0.5815306901931763, "learning_rate": 8.530331163677e-06, "loss": 0.6907, "step": 1353 }, { "epoch": 0.27314994310426893, "grad_norm": 0.45392102003097534, "learning_rate": 8.528016382810744e-06, "loss": 0.8513, "step": 1354 }, { "epoch": 0.2733516786604759, "grad_norm": 0.4220741391181946, "learning_rate": 8.525700095052655e-06, "loss": 0.8672, "step": 1355 }, { "epoch": 0.2735534142166829, "grad_norm": 0.867940366268158, "learning_rate": 8.523382301392071e-06, "loss": 0.7781, "step": 1356 }, { "epoch": 0.2737551497728899, "grad_norm": 0.5932605862617493, "learning_rate": 8.52106300281897e-06, "loss": 0.8358, "step": 1357 }, { "epoch": 0.2739568853290969, "grad_norm": 0.5759530663490295, "learning_rate": 8.518742200323977e-06, "loss": 0.835, "step": 1358 }, { "epoch": 0.2741586208853039, "grad_norm": 0.37414512038230896, "learning_rate": 8.516419894898356e-06, "loss": 0.7588, "step": 1359 }, { "epoch": 0.27436035644151086, "grad_norm": 0.8124709725379944, "learning_rate": 8.51409608753401e-06, "loss": 0.7914, "step": 1360 }, { "epoch": 0.27456209199771786, "grad_norm": 0.7299355864524841, "learning_rate": 8.511770779223491e-06, "loss": 0.8218, "step": 1361 }, { "epoch": 0.27476382755392487, "grad_norm": 0.8825898766517639, "learning_rate": 8.50944397095999e-06, "loss": 0.6635, "step": 1362 }, { "epoch": 0.27496556311013187, "grad_norm": 0.5448117256164551, "learning_rate": 8.507115663737331e-06, "loss": 0.6729, "step": 1363 }, { "epoch": 0.2751672986663388, "grad_norm": 0.8231639266014099, "learning_rate": 8.504785858549989e-06, "loss": 0.7734, "step": 1364 }, { "epoch": 0.2753690342225458, "grad_norm": 1.6986454725265503, "learning_rate": 8.502454556393071e-06, "loss": 0.8105, "step": 1365 }, { "epoch": 0.27557076977875283, "grad_norm": 1.8813633918762207, "learning_rate": 8.50012175826233e-06, "loss": 0.8676, "step": 1366 }, { "epoch": 0.27577250533495984, "grad_norm": 0.4902789294719696, "learning_rate": 8.49778746515415e-06, "loss": 0.7333, "step": 1367 }, { "epoch": 0.27597424089116684, "grad_norm": 0.3125844895839691, "learning_rate": 8.495451678065563e-06, "loss": 0.822, "step": 1368 }, { "epoch": 0.2761759764473738, "grad_norm": 0.8880373239517212, "learning_rate": 8.493114397994229e-06, "loss": 0.7292, "step": 1369 }, { "epoch": 0.2763777120035808, "grad_norm": 0.3109457492828369, "learning_rate": 8.490775625938452e-06, "loss": 0.703, "step": 1370 }, { "epoch": 0.2765794475597878, "grad_norm": 0.5545622706413269, "learning_rate": 8.488435362897176e-06, "loss": 0.7946, "step": 1371 }, { "epoch": 0.2767811831159948, "grad_norm": 1.6414847373962402, "learning_rate": 8.486093609869972e-06, "loss": 0.7223, "step": 1372 }, { "epoch": 0.27698291867220176, "grad_norm": 0.4702077805995941, "learning_rate": 8.483750367857056e-06, "loss": 0.6668, "step": 1373 }, { "epoch": 0.27718465422840877, "grad_norm": 0.7001679539680481, "learning_rate": 8.481405637859277e-06, "loss": 0.7024, "step": 1374 }, { "epoch": 0.27738638978461577, "grad_norm": 0.6269954442977905, "learning_rate": 8.479059420878121e-06, "loss": 0.729, "step": 1375 }, { "epoch": 0.2775881253408228, "grad_norm": 0.39072608947753906, "learning_rate": 8.476711717915707e-06, "loss": 0.8695, "step": 1376 }, { "epoch": 0.2777898608970298, "grad_norm": 0.5733974575996399, "learning_rate": 8.474362529974787e-06, "loss": 0.6856, "step": 1377 }, { "epoch": 0.27799159645323673, "grad_norm": 0.47215527296066284, "learning_rate": 8.472011858058751e-06, "loss": 0.8618, "step": 1378 }, { "epoch": 0.27819333200944374, "grad_norm": 0.7275539040565491, "learning_rate": 8.469659703171624e-06, "loss": 0.849, "step": 1379 }, { "epoch": 0.27839506756565074, "grad_norm": 0.6656959652900696, "learning_rate": 8.467306066318063e-06, "loss": 0.7511, "step": 1380 }, { "epoch": 0.27859680312185775, "grad_norm": 0.7071893215179443, "learning_rate": 8.46495094850335e-06, "loss": 0.6808, "step": 1381 }, { "epoch": 0.2787985386780647, "grad_norm": 1.0060806274414062, "learning_rate": 8.462594350733414e-06, "loss": 0.6832, "step": 1382 }, { "epoch": 0.2790002742342717, "grad_norm": 0.5994428992271423, "learning_rate": 8.460236274014805e-06, "loss": 0.6658, "step": 1383 }, { "epoch": 0.2792020097904787, "grad_norm": 0.5965285897254944, "learning_rate": 8.457876719354708e-06, "loss": 0.661, "step": 1384 }, { "epoch": 0.2794037453466857, "grad_norm": 0.4691231846809387, "learning_rate": 8.455515687760943e-06, "loss": 0.8201, "step": 1385 }, { "epoch": 0.2796054809028927, "grad_norm": 0.7659153938293457, "learning_rate": 8.453153180241954e-06, "loss": 0.7907, "step": 1386 }, { "epoch": 0.2798072164590997, "grad_norm": 0.48467427492141724, "learning_rate": 8.450789197806819e-06, "loss": 0.8428, "step": 1387 }, { "epoch": 0.2800089520153067, "grad_norm": 0.800972044467926, "learning_rate": 8.448423741465249e-06, "loss": 0.7075, "step": 1388 }, { "epoch": 0.2802106875715137, "grad_norm": 0.5204843878746033, "learning_rate": 8.446056812227579e-06, "loss": 0.6352, "step": 1389 }, { "epoch": 0.2804124231277207, "grad_norm": 0.414917916059494, "learning_rate": 8.443688411104775e-06, "loss": 0.669, "step": 1390 }, { "epoch": 0.28061415868392764, "grad_norm": 0.316938579082489, "learning_rate": 8.441318539108433e-06, "loss": 0.7909, "step": 1391 }, { "epoch": 0.28081589424013464, "grad_norm": 0.5244396924972534, "learning_rate": 8.43894719725078e-06, "loss": 0.7192, "step": 1392 }, { "epoch": 0.28101762979634165, "grad_norm": 0.5841974020004272, "learning_rate": 8.43657438654466e-06, "loss": 0.7106, "step": 1393 }, { "epoch": 0.28121936535254866, "grad_norm": 0.41134485602378845, "learning_rate": 8.434200108003556e-06, "loss": 0.6971, "step": 1394 }, { "epoch": 0.28142110090875566, "grad_norm": 0.3490750789642334, "learning_rate": 8.431824362641573e-06, "loss": 0.6944, "step": 1395 }, { "epoch": 0.2816228364649626, "grad_norm": 0.5032677054405212, "learning_rate": 8.429447151473443e-06, "loss": 0.8536, "step": 1396 }, { "epoch": 0.2818245720211696, "grad_norm": 0.4385986924171448, "learning_rate": 8.427068475514524e-06, "loss": 0.7629, "step": 1397 }, { "epoch": 0.2820263075773766, "grad_norm": 0.779082179069519, "learning_rate": 8.424688335780799e-06, "loss": 0.6729, "step": 1398 }, { "epoch": 0.28222804313358363, "grad_norm": 0.8364329934120178, "learning_rate": 8.42230673328888e-06, "loss": 0.7799, "step": 1399 }, { "epoch": 0.2824297786897906, "grad_norm": 0.4209824502468109, "learning_rate": 8.419923669055995e-06, "loss": 0.7129, "step": 1400 }, { "epoch": 0.2826315142459976, "grad_norm": 0.4362705945968628, "learning_rate": 8.417539144100008e-06, "loss": 0.6584, "step": 1401 }, { "epoch": 0.2828332498022046, "grad_norm": 0.4876041114330292, "learning_rate": 8.415153159439397e-06, "loss": 0.7438, "step": 1402 }, { "epoch": 0.2830349853584116, "grad_norm": 0.513187825679779, "learning_rate": 8.412765716093273e-06, "loss": 0.6959, "step": 1403 }, { "epoch": 0.2832367209146186, "grad_norm": 0.4529809355735779, "learning_rate": 8.410376815081356e-06, "loss": 0.7872, "step": 1404 }, { "epoch": 0.28343845647082555, "grad_norm": 0.39496833086013794, "learning_rate": 8.407986457424002e-06, "loss": 0.7546, "step": 1405 }, { "epoch": 0.28364019202703256, "grad_norm": 14.25558853149414, "learning_rate": 8.405594644142186e-06, "loss": 0.83, "step": 1406 }, { "epoch": 0.28384192758323956, "grad_norm": 0.7426337003707886, "learning_rate": 8.4032013762575e-06, "loss": 0.6679, "step": 1407 }, { "epoch": 0.28404366313944657, "grad_norm": 2.217550039291382, "learning_rate": 8.400806654792161e-06, "loss": 0.7844, "step": 1408 }, { "epoch": 0.2842453986956535, "grad_norm": 2.4115359783172607, "learning_rate": 8.398410480769007e-06, "loss": 0.6876, "step": 1409 }, { "epoch": 0.2844471342518605, "grad_norm": 2.4654650688171387, "learning_rate": 8.396012855211494e-06, "loss": 0.7054, "step": 1410 }, { "epoch": 0.28464886980806753, "grad_norm": 0.7609624862670898, "learning_rate": 8.393613779143703e-06, "loss": 0.8621, "step": 1411 }, { "epoch": 0.28485060536427453, "grad_norm": 0.45478445291519165, "learning_rate": 8.391213253590325e-06, "loss": 0.7032, "step": 1412 }, { "epoch": 0.28505234092048154, "grad_norm": 0.6453161835670471, "learning_rate": 8.388811279576682e-06, "loss": 0.6764, "step": 1413 }, { "epoch": 0.2852540764766885, "grad_norm": 0.4519941210746765, "learning_rate": 8.386407858128707e-06, "loss": 0.6926, "step": 1414 }, { "epoch": 0.2854558120328955, "grad_norm": 1.1275166273117065, "learning_rate": 8.384002990272951e-06, "loss": 0.6854, "step": 1415 }, { "epoch": 0.2856575475891025, "grad_norm": 0.391759991645813, "learning_rate": 8.381596677036588e-06, "loss": 0.6812, "step": 1416 }, { "epoch": 0.2858592831453095, "grad_norm": 0.3690997064113617, "learning_rate": 8.379188919447405e-06, "loss": 0.6598, "step": 1417 }, { "epoch": 0.28606101870151646, "grad_norm": 0.6712028384208679, "learning_rate": 8.376779718533806e-06, "loss": 0.7243, "step": 1418 }, { "epoch": 0.28626275425772346, "grad_norm": 0.6204757690429688, "learning_rate": 8.374369075324813e-06, "loss": 0.7349, "step": 1419 }, { "epoch": 0.28646448981393047, "grad_norm": 0.6927712559700012, "learning_rate": 8.371956990850065e-06, "loss": 0.6993, "step": 1420 }, { "epoch": 0.2866662253701375, "grad_norm": 0.42022544145584106, "learning_rate": 8.369543466139816e-06, "loss": 0.7363, "step": 1421 }, { "epoch": 0.2868679609263445, "grad_norm": 0.6742234826087952, "learning_rate": 8.367128502224931e-06, "loss": 0.7005, "step": 1422 }, { "epoch": 0.28706969648255143, "grad_norm": 0.5271845459938049, "learning_rate": 8.364712100136897e-06, "loss": 0.7629, "step": 1423 }, { "epoch": 0.28727143203875843, "grad_norm": 0.4005318284034729, "learning_rate": 8.362294260907808e-06, "loss": 0.641, "step": 1424 }, { "epoch": 0.28747316759496544, "grad_norm": 1.1153535842895508, "learning_rate": 8.359874985570378e-06, "loss": 0.8169, "step": 1425 }, { "epoch": 0.28767490315117245, "grad_norm": 1.0508934259414673, "learning_rate": 8.35745427515793e-06, "loss": 0.7177, "step": 1426 }, { "epoch": 0.28787663870737945, "grad_norm": 0.40963393449783325, "learning_rate": 8.355032130704402e-06, "loss": 0.6816, "step": 1427 }, { "epoch": 0.2880783742635864, "grad_norm": 0.8009536266326904, "learning_rate": 8.352608553244344e-06, "loss": 0.72, "step": 1428 }, { "epoch": 0.2882801098197934, "grad_norm": 0.8653877973556519, "learning_rate": 8.350183543812918e-06, "loss": 0.6564, "step": 1429 }, { "epoch": 0.2884818453760004, "grad_norm": 0.4677664339542389, "learning_rate": 8.3477571034459e-06, "loss": 0.712, "step": 1430 }, { "epoch": 0.2886835809322074, "grad_norm": 1.347430944442749, "learning_rate": 8.34532923317967e-06, "loss": 0.6571, "step": 1431 }, { "epoch": 0.28888531648841437, "grad_norm": 0.5276641845703125, "learning_rate": 8.342899934051229e-06, "loss": 0.7271, "step": 1432 }, { "epoch": 0.2890870520446214, "grad_norm": 1.0363144874572754, "learning_rate": 8.34046920709818e-06, "loss": 0.9796, "step": 1433 }, { "epoch": 0.2892887876008284, "grad_norm": 0.7945374846458435, "learning_rate": 8.338037053358739e-06, "loss": 0.6992, "step": 1434 }, { "epoch": 0.2894905231570354, "grad_norm": 0.7973695397377014, "learning_rate": 8.335603473871734e-06, "loss": 0.8969, "step": 1435 }, { "epoch": 0.2896922587132424, "grad_norm": 0.40596818923950195, "learning_rate": 8.333168469676595e-06, "loss": 0.704, "step": 1436 }, { "epoch": 0.28989399426944934, "grad_norm": 0.43208691477775574, "learning_rate": 8.330732041813367e-06, "loss": 0.7658, "step": 1437 }, { "epoch": 0.29009572982565635, "grad_norm": 0.9563391804695129, "learning_rate": 8.328294191322703e-06, "loss": 0.7315, "step": 1438 }, { "epoch": 0.29029746538186335, "grad_norm": 0.36794593930244446, "learning_rate": 8.325854919245859e-06, "loss": 0.8485, "step": 1439 }, { "epoch": 0.29049920093807036, "grad_norm": 0.3880400061607361, "learning_rate": 8.323414226624699e-06, "loss": 0.6999, "step": 1440 }, { "epoch": 0.2907009364942773, "grad_norm": 0.68841952085495, "learning_rate": 8.320972114501698e-06, "loss": 0.9734, "step": 1441 }, { "epoch": 0.2909026720504843, "grad_norm": 0.5601948499679565, "learning_rate": 8.318528583919933e-06, "loss": 0.8702, "step": 1442 }, { "epoch": 0.2911044076066913, "grad_norm": 0.442568302154541, "learning_rate": 8.31608363592309e-06, "loss": 0.6594, "step": 1443 }, { "epoch": 0.2913061431628983, "grad_norm": 0.7428321838378906, "learning_rate": 8.313637271555462e-06, "loss": 0.6561, "step": 1444 }, { "epoch": 0.29150787871910533, "grad_norm": 2.2169814109802246, "learning_rate": 8.311189491861938e-06, "loss": 0.669, "step": 1445 }, { "epoch": 0.2917096142753123, "grad_norm": 2.1682381629943848, "learning_rate": 8.30874029788802e-06, "loss": 0.791, "step": 1446 }, { "epoch": 0.2919113498315193, "grad_norm": 0.6268622875213623, "learning_rate": 8.306289690679812e-06, "loss": 0.7264, "step": 1447 }, { "epoch": 0.2921130853877263, "grad_norm": 0.757176399230957, "learning_rate": 8.30383767128402e-06, "loss": 0.7165, "step": 1448 }, { "epoch": 0.2923148209439333, "grad_norm": 0.4244846999645233, "learning_rate": 8.301384240747957e-06, "loss": 0.7516, "step": 1449 }, { "epoch": 0.29251655650014025, "grad_norm": 1.070397138595581, "learning_rate": 8.298929400119533e-06, "loss": 0.6561, "step": 1450 }, { "epoch": 0.29271829205634725, "grad_norm": 0.5053624510765076, "learning_rate": 8.296473150447263e-06, "loss": 0.6578, "step": 1451 }, { "epoch": 0.29292002761255426, "grad_norm": 0.4902132451534271, "learning_rate": 8.294015492780267e-06, "loss": 0.7171, "step": 1452 }, { "epoch": 0.29312176316876126, "grad_norm": 0.3744256794452667, "learning_rate": 8.291556428168263e-06, "loss": 0.6644, "step": 1453 }, { "epoch": 0.29332349872496827, "grad_norm": 0.6036210060119629, "learning_rate": 8.289095957661569e-06, "loss": 0.8321, "step": 1454 }, { "epoch": 0.2935252342811752, "grad_norm": 0.6904403567314148, "learning_rate": 8.286634082311107e-06, "loss": 0.6942, "step": 1455 }, { "epoch": 0.2937269698373822, "grad_norm": 0.5230683088302612, "learning_rate": 8.284170803168393e-06, "loss": 0.6435, "step": 1456 }, { "epoch": 0.29392870539358923, "grad_norm": 0.3730091154575348, "learning_rate": 8.28170612128555e-06, "loss": 0.6557, "step": 1457 }, { "epoch": 0.29413044094979623, "grad_norm": 0.44985833764076233, "learning_rate": 8.279240037715297e-06, "loss": 0.7341, "step": 1458 }, { "epoch": 0.2943321765060032, "grad_norm": 0.4488007724285126, "learning_rate": 8.27677255351095e-06, "loss": 0.7637, "step": 1459 }, { "epoch": 0.2945339120622102, "grad_norm": 4.375908374786377, "learning_rate": 8.274303669726427e-06, "loss": 0.8729, "step": 1460 }, { "epoch": 0.2947356476184172, "grad_norm": 0.7124999165534973, "learning_rate": 8.271833387416237e-06, "loss": 0.6708, "step": 1461 }, { "epoch": 0.2949373831746242, "grad_norm": 0.5048505663871765, "learning_rate": 8.269361707635494e-06, "loss": 0.6511, "step": 1462 }, { "epoch": 0.2951391187308312, "grad_norm": 0.9429525136947632, "learning_rate": 8.266888631439907e-06, "loss": 0.8462, "step": 1463 }, { "epoch": 0.29534085428703816, "grad_norm": 0.49526622891426086, "learning_rate": 8.264414159885776e-06, "loss": 0.7101, "step": 1464 }, { "epoch": 0.29554258984324516, "grad_norm": 0.623329758644104, "learning_rate": 8.261938294030003e-06, "loss": 0.6689, "step": 1465 }, { "epoch": 0.29574432539945217, "grad_norm": 1.0023547410964966, "learning_rate": 8.259461034930088e-06, "loss": 0.7473, "step": 1466 }, { "epoch": 0.2959460609556592, "grad_norm": 0.555940568447113, "learning_rate": 8.256982383644114e-06, "loss": 0.6968, "step": 1467 }, { "epoch": 0.2961477965118661, "grad_norm": 2.2776830196380615, "learning_rate": 8.254502341230771e-06, "loss": 0.76, "step": 1468 }, { "epoch": 0.29634953206807313, "grad_norm": 2.1908066272735596, "learning_rate": 8.252020908749338e-06, "loss": 0.7083, "step": 1469 }, { "epoch": 0.29655126762428013, "grad_norm": 2.6084344387054443, "learning_rate": 8.24953808725969e-06, "loss": 0.6853, "step": 1470 }, { "epoch": 0.29675300318048714, "grad_norm": 2.2120113372802734, "learning_rate": 8.24705387782229e-06, "loss": 0.6915, "step": 1471 }, { "epoch": 0.29695473873669415, "grad_norm": 1.1153708696365356, "learning_rate": 8.244568281498198e-06, "loss": 0.6737, "step": 1472 }, { "epoch": 0.2971564742929011, "grad_norm": 1.088796615600586, "learning_rate": 8.24208129934907e-06, "loss": 0.6738, "step": 1473 }, { "epoch": 0.2973582098491081, "grad_norm": 0.6631084680557251, "learning_rate": 8.239592932437144e-06, "loss": 0.6507, "step": 1474 }, { "epoch": 0.2975599454053151, "grad_norm": 0.6017811298370361, "learning_rate": 8.237103181825257e-06, "loss": 0.9657, "step": 1475 }, { "epoch": 0.2977616809615221, "grad_norm": 0.7511520385742188, "learning_rate": 8.234612048576838e-06, "loss": 0.629, "step": 1476 }, { "epoch": 0.29796341651772906, "grad_norm": 0.593137800693512, "learning_rate": 8.2321195337559e-06, "loss": 0.6513, "step": 1477 }, { "epoch": 0.29816515207393607, "grad_norm": 0.5652965307235718, "learning_rate": 8.229625638427052e-06, "loss": 0.6914, "step": 1478 }, { "epoch": 0.2983668876301431, "grad_norm": 0.4325360357761383, "learning_rate": 8.22713036365549e-06, "loss": 0.6658, "step": 1479 }, { "epoch": 0.2985686231863501, "grad_norm": 0.382667601108551, "learning_rate": 8.224633710506997e-06, "loss": 0.8343, "step": 1480 }, { "epoch": 0.2987703587425571, "grad_norm": 2.504685640335083, "learning_rate": 8.222135680047952e-06, "loss": 0.7681, "step": 1481 }, { "epoch": 0.29897209429876404, "grad_norm": 0.8869113922119141, "learning_rate": 8.219636273345315e-06, "loss": 0.6972, "step": 1482 }, { "epoch": 0.29917382985497104, "grad_norm": 0.6380051374435425, "learning_rate": 8.217135491466636e-06, "loss": 0.6778, "step": 1483 }, { "epoch": 0.29937556541117805, "grad_norm": 0.4614868462085724, "learning_rate": 8.214633335480055e-06, "loss": 0.7061, "step": 1484 }, { "epoch": 0.29957730096738505, "grad_norm": 0.5545341968536377, "learning_rate": 8.212129806454294e-06, "loss": 0.7063, "step": 1485 }, { "epoch": 0.299779036523592, "grad_norm": 0.4980104863643646, "learning_rate": 8.209624905458667e-06, "loss": 0.65, "step": 1486 }, { "epoch": 0.299980772079799, "grad_norm": 0.619178831577301, "learning_rate": 8.20711863356307e-06, "loss": 0.7838, "step": 1487 }, { "epoch": 0.300182507636006, "grad_norm": 0.4087018072605133, "learning_rate": 8.204610991837983e-06, "loss": 0.7019, "step": 1488 }, { "epoch": 0.300384243192213, "grad_norm": 0.39627236127853394, "learning_rate": 8.202101981354478e-06, "loss": 0.6936, "step": 1489 }, { "epoch": 0.30058597874842, "grad_norm": 1.5058199167251587, "learning_rate": 8.199591603184205e-06, "loss": 0.8135, "step": 1490 }, { "epoch": 0.300787714304627, "grad_norm": 0.5494144558906555, "learning_rate": 8.197079858399403e-06, "loss": 0.8652, "step": 1491 }, { "epoch": 0.300989449860834, "grad_norm": 0.7970489263534546, "learning_rate": 8.19456674807289e-06, "loss": 0.7358, "step": 1492 }, { "epoch": 0.301191185417041, "grad_norm": 0.7870729565620422, "learning_rate": 8.19205227327807e-06, "loss": 0.773, "step": 1493 }, { "epoch": 0.301392920973248, "grad_norm": 1.1276938915252686, "learning_rate": 8.189536435088931e-06, "loss": 0.6871, "step": 1494 }, { "epoch": 0.30159465652945494, "grad_norm": 4.686948776245117, "learning_rate": 8.18701923458004e-06, "loss": 0.6465, "step": 1495 }, { "epoch": 0.30179639208566195, "grad_norm": 2.210895538330078, "learning_rate": 8.184500672826547e-06, "loss": 0.6671, "step": 1496 }, { "epoch": 0.30199812764186895, "grad_norm": 3.591681957244873, "learning_rate": 8.181980750904185e-06, "loss": 0.6889, "step": 1497 }, { "epoch": 0.30219986319807596, "grad_norm": 2.6738879680633545, "learning_rate": 8.179459469889269e-06, "loss": 0.6789, "step": 1498 }, { "epoch": 0.30240159875428296, "grad_norm": 1.4650682210922241, "learning_rate": 8.176936830858689e-06, "loss": 0.7287, "step": 1499 }, { "epoch": 0.3026033343104899, "grad_norm": 2.2726752758026123, "learning_rate": 8.17441283488992e-06, "loss": 0.6666, "step": 1500 }, { "epoch": 0.3028050698666969, "grad_norm": 0.5218955278396606, "learning_rate": 8.171887483061014e-06, "loss": 0.7004, "step": 1501 }, { "epoch": 0.3030068054229039, "grad_norm": 0.559872031211853, "learning_rate": 8.169360776450606e-06, "loss": 0.6717, "step": 1502 }, { "epoch": 0.30320854097911093, "grad_norm": 0.760443389415741, "learning_rate": 8.166832716137905e-06, "loss": 0.7983, "step": 1503 }, { "epoch": 0.3034102765353179, "grad_norm": 0.8818268775939941, "learning_rate": 8.164303303202698e-06, "loss": 0.7724, "step": 1504 }, { "epoch": 0.3036120120915249, "grad_norm": 0.5333113670349121, "learning_rate": 8.161772538725357e-06, "loss": 0.6922, "step": 1505 }, { "epoch": 0.3038137476477319, "grad_norm": 0.5082593560218811, "learning_rate": 8.15924042378682e-06, "loss": 0.6744, "step": 1506 }, { "epoch": 0.3040154832039389, "grad_norm": 0.5612021684646606, "learning_rate": 8.156706959468611e-06, "loss": 0.763, "step": 1507 }, { "epoch": 0.3042172187601459, "grad_norm": 0.4409393072128296, "learning_rate": 8.15417214685283e-06, "loss": 0.7668, "step": 1508 }, { "epoch": 0.30441895431635285, "grad_norm": 0.7158373594284058, "learning_rate": 8.151635987022146e-06, "loss": 0.788, "step": 1509 }, { "epoch": 0.30462068987255986, "grad_norm": 0.7124815583229065, "learning_rate": 8.149098481059807e-06, "loss": 0.7019, "step": 1510 }, { "epoch": 0.30482242542876686, "grad_norm": 0.5579118132591248, "learning_rate": 8.146559630049639e-06, "loss": 0.7112, "step": 1511 }, { "epoch": 0.30502416098497387, "grad_norm": 0.4145357310771942, "learning_rate": 8.14401943507604e-06, "loss": 0.7051, "step": 1512 }, { "epoch": 0.3052258965411808, "grad_norm": 0.43273279070854187, "learning_rate": 8.14147789722398e-06, "loss": 0.8772, "step": 1513 }, { "epoch": 0.3054276320973878, "grad_norm": 0.4729679226875305, "learning_rate": 8.138935017579007e-06, "loss": 0.6842, "step": 1514 }, { "epoch": 0.30562936765359483, "grad_norm": 3.5257089138031006, "learning_rate": 8.136390797227235e-06, "loss": 0.7959, "step": 1515 }, { "epoch": 0.30583110320980184, "grad_norm": 0.608936071395874, "learning_rate": 8.133845237255361e-06, "loss": 0.679, "step": 1516 }, { "epoch": 0.30603283876600884, "grad_norm": 0.5570759773254395, "learning_rate": 8.131298338750648e-06, "loss": 0.6549, "step": 1517 }, { "epoch": 0.3062345743222158, "grad_norm": 0.5858903527259827, "learning_rate": 8.128750102800929e-06, "loss": 0.8579, "step": 1518 }, { "epoch": 0.3064363098784228, "grad_norm": 0.6043578386306763, "learning_rate": 8.12620053049461e-06, "loss": 0.8372, "step": 1519 }, { "epoch": 0.3066380454346298, "grad_norm": 0.4177265465259552, "learning_rate": 8.12364962292067e-06, "loss": 0.683, "step": 1520 }, { "epoch": 0.3068397809908368, "grad_norm": 0.5823288559913635, "learning_rate": 8.121097381168654e-06, "loss": 0.6338, "step": 1521 }, { "epoch": 0.3070415165470438, "grad_norm": 1.1234952211380005, "learning_rate": 8.118543806328682e-06, "loss": 0.6671, "step": 1522 }, { "epoch": 0.30724325210325076, "grad_norm": 0.36531540751457214, "learning_rate": 8.11598889949144e-06, "loss": 0.723, "step": 1523 }, { "epoch": 0.30744498765945777, "grad_norm": 0.8904269337654114, "learning_rate": 8.113432661748187e-06, "loss": 0.6854, "step": 1524 }, { "epoch": 0.3076467232156648, "grad_norm": 0.38234153389930725, "learning_rate": 8.110875094190742e-06, "loss": 0.7636, "step": 1525 }, { "epoch": 0.3078484587718718, "grad_norm": 1.4962918758392334, "learning_rate": 8.108316197911498e-06, "loss": 0.8294, "step": 1526 }, { "epoch": 0.30805019432807873, "grad_norm": 4.313572883605957, "learning_rate": 8.105755974003418e-06, "loss": 0.6851, "step": 1527 }, { "epoch": 0.30825192988428574, "grad_norm": 0.4833182394504547, "learning_rate": 8.103194423560026e-06, "loss": 0.7248, "step": 1528 }, { "epoch": 0.30845366544049274, "grad_norm": 0.40504032373428345, "learning_rate": 8.100631547675417e-06, "loss": 0.6765, "step": 1529 }, { "epoch": 0.30865540099669975, "grad_norm": 0.66014564037323, "learning_rate": 8.09806734744425e-06, "loss": 0.6544, "step": 1530 }, { "epoch": 0.30885713655290675, "grad_norm": 0.6813423037528992, "learning_rate": 8.095501823961752e-06, "loss": 0.679, "step": 1531 }, { "epoch": 0.3090588721091137, "grad_norm": 0.6496931314468384, "learning_rate": 8.092934978323708e-06, "loss": 0.8407, "step": 1532 }, { "epoch": 0.3092606076653207, "grad_norm": 0.7875673174858093, "learning_rate": 8.090366811626477e-06, "loss": 0.8551, "step": 1533 }, { "epoch": 0.3094623432215277, "grad_norm": 0.4039202034473419, "learning_rate": 8.087797324966981e-06, "loss": 0.8194, "step": 1534 }, { "epoch": 0.3096640787777347, "grad_norm": 0.4680841565132141, "learning_rate": 8.085226519442697e-06, "loss": 0.6882, "step": 1535 }, { "epoch": 0.30986581433394167, "grad_norm": 0.37298035621643066, "learning_rate": 8.082654396151676e-06, "loss": 0.6403, "step": 1536 }, { "epoch": 0.3100675498901487, "grad_norm": 0.43404385447502136, "learning_rate": 8.080080956192525e-06, "loss": 0.7122, "step": 1537 }, { "epoch": 0.3102692854463557, "grad_norm": 0.4201224148273468, "learning_rate": 8.077506200664416e-06, "loss": 0.8081, "step": 1538 }, { "epoch": 0.3104710210025627, "grad_norm": 0.8215480446815491, "learning_rate": 8.074930130667085e-06, "loss": 0.6545, "step": 1539 }, { "epoch": 0.3106727565587697, "grad_norm": 0.8744224309921265, "learning_rate": 8.072352747300823e-06, "loss": 0.6507, "step": 1540 }, { "epoch": 0.31087449211497664, "grad_norm": 0.9224565625190735, "learning_rate": 8.06977405166649e-06, "loss": 0.6646, "step": 1541 }, { "epoch": 0.31107622767118365, "grad_norm": 0.9008786678314209, "learning_rate": 8.067194044865499e-06, "loss": 0.6832, "step": 1542 }, { "epoch": 0.31127796322739065, "grad_norm": 0.5391042828559875, "learning_rate": 8.064612727999827e-06, "loss": 0.683, "step": 1543 }, { "epoch": 0.31147969878359766, "grad_norm": 0.7307866811752319, "learning_rate": 8.062030102172013e-06, "loss": 0.6636, "step": 1544 }, { "epoch": 0.3116814343398046, "grad_norm": 1.0967941284179688, "learning_rate": 8.05944616848515e-06, "loss": 0.7132, "step": 1545 }, { "epoch": 0.3118831698960116, "grad_norm": 0.661615788936615, "learning_rate": 8.056860928042892e-06, "loss": 0.8316, "step": 1546 }, { "epoch": 0.3120849054522186, "grad_norm": 0.5277570486068726, "learning_rate": 8.054274381949449e-06, "loss": 0.7624, "step": 1547 }, { "epoch": 0.3122866410084256, "grad_norm": 0.4380006790161133, "learning_rate": 8.051686531309595e-06, "loss": 0.7945, "step": 1548 }, { "epoch": 0.31248837656463263, "grad_norm": 0.48799121379852295, "learning_rate": 8.049097377228653e-06, "loss": 0.7093, "step": 1549 }, { "epoch": 0.3126901121208396, "grad_norm": 0.4029648005962372, "learning_rate": 8.046506920812508e-06, "loss": 0.9447, "step": 1550 }, { "epoch": 0.3128918476770466, "grad_norm": 0.8204885125160217, "learning_rate": 8.0439151631676e-06, "loss": 0.8118, "step": 1551 }, { "epoch": 0.3130935832332536, "grad_norm": 2.713876962661743, "learning_rate": 8.041322105400923e-06, "loss": 0.7025, "step": 1552 }, { "epoch": 0.3132953187894606, "grad_norm": 0.3328094482421875, "learning_rate": 8.03872774862003e-06, "loss": 0.7096, "step": 1553 }, { "epoch": 0.31349705434566755, "grad_norm": 0.75003582239151, "learning_rate": 8.036132093933025e-06, "loss": 0.7072, "step": 1554 }, { "epoch": 0.31369878990187455, "grad_norm": 1.3351184129714966, "learning_rate": 8.03353514244857e-06, "loss": 0.7103, "step": 1555 }, { "epoch": 0.31390052545808156, "grad_norm": 0.38253384828567505, "learning_rate": 8.030936895275875e-06, "loss": 0.6638, "step": 1556 }, { "epoch": 0.31410226101428856, "grad_norm": 0.43413951992988586, "learning_rate": 8.028337353524712e-06, "loss": 0.8378, "step": 1557 }, { "epoch": 0.31430399657049557, "grad_norm": 0.5159037709236145, "learning_rate": 8.025736518305398e-06, "loss": 0.7017, "step": 1558 }, { "epoch": 0.3145057321267025, "grad_norm": 0.9960720539093018, "learning_rate": 8.023134390728808e-06, "loss": 0.6939, "step": 1559 }, { "epoch": 0.3147074676829095, "grad_norm": 0.8572008609771729, "learning_rate": 8.020530971906365e-06, "loss": 0.6778, "step": 1560 }, { "epoch": 0.31490920323911653, "grad_norm": 0.5154844522476196, "learning_rate": 8.017926262950048e-06, "loss": 0.7486, "step": 1561 }, { "epoch": 0.31511093879532354, "grad_norm": 0.722102701663971, "learning_rate": 8.015320264972381e-06, "loss": 0.691, "step": 1562 }, { "epoch": 0.3153126743515305, "grad_norm": 0.7588250041007996, "learning_rate": 8.012712979086444e-06, "loss": 0.6785, "step": 1563 }, { "epoch": 0.3155144099077375, "grad_norm": 1.2562438249588013, "learning_rate": 8.010104406405865e-06, "loss": 0.6535, "step": 1564 }, { "epoch": 0.3157161454639445, "grad_norm": 0.39819613099098206, "learning_rate": 8.00749454804482e-06, "loss": 0.6738, "step": 1565 }, { "epoch": 0.3159178810201515, "grad_norm": 1.0211012363433838, "learning_rate": 8.004883405118036e-06, "loss": 0.6788, "step": 1566 }, { "epoch": 0.3161196165763585, "grad_norm": 0.6734877824783325, "learning_rate": 8.00227097874079e-06, "loss": 0.7398, "step": 1567 }, { "epoch": 0.31632135213256546, "grad_norm": 0.4316209554672241, "learning_rate": 7.999657270028904e-06, "loss": 0.7092, "step": 1568 }, { "epoch": 0.31652308768877246, "grad_norm": 0.39186379313468933, "learning_rate": 7.997042280098752e-06, "loss": 0.902, "step": 1569 }, { "epoch": 0.31672482324497947, "grad_norm": 0.9745991826057434, "learning_rate": 7.99442601006725e-06, "loss": 0.6493, "step": 1570 }, { "epoch": 0.3169265588011865, "grad_norm": 0.48621875047683716, "learning_rate": 7.991808461051862e-06, "loss": 0.6901, "step": 1571 }, { "epoch": 0.3171282943573934, "grad_norm": 0.6053562164306641, "learning_rate": 7.989189634170603e-06, "loss": 0.8829, "step": 1572 }, { "epoch": 0.31733002991360043, "grad_norm": 1.5087822675704956, "learning_rate": 7.986569530542028e-06, "loss": 0.6407, "step": 1573 }, { "epoch": 0.31753176546980744, "grad_norm": 0.5080868601799011, "learning_rate": 7.983948151285242e-06, "loss": 0.6927, "step": 1574 }, { "epoch": 0.31773350102601444, "grad_norm": 0.3664741516113281, "learning_rate": 7.981325497519892e-06, "loss": 0.6811, "step": 1575 }, { "epoch": 0.31793523658222145, "grad_norm": 0.9207780957221985, "learning_rate": 7.978701570366167e-06, "loss": 0.7989, "step": 1576 }, { "epoch": 0.3181369721384284, "grad_norm": 0.4849698543548584, "learning_rate": 7.976076370944805e-06, "loss": 0.6831, "step": 1577 }, { "epoch": 0.3183387076946354, "grad_norm": 0.33219724893569946, "learning_rate": 7.973449900377086e-06, "loss": 0.7534, "step": 1578 }, { "epoch": 0.3185404432508424, "grad_norm": 0.77428138256073, "learning_rate": 7.970822159784832e-06, "loss": 0.7303, "step": 1579 }, { "epoch": 0.3187421788070494, "grad_norm": 0.3515469431877136, "learning_rate": 7.968193150290408e-06, "loss": 0.6644, "step": 1580 }, { "epoch": 0.31894391436325636, "grad_norm": 0.4491312801837921, "learning_rate": 7.96556287301672e-06, "loss": 0.6831, "step": 1581 }, { "epoch": 0.31914564991946337, "grad_norm": 0.6044942140579224, "learning_rate": 7.962931329087214e-06, "loss": 0.6645, "step": 1582 }, { "epoch": 0.3193473854756704, "grad_norm": 0.6661559343338013, "learning_rate": 7.96029851962588e-06, "loss": 0.6956, "step": 1583 }, { "epoch": 0.3195491210318774, "grad_norm": 0.54176926612854, "learning_rate": 7.95766444575725e-06, "loss": 0.7459, "step": 1584 }, { "epoch": 0.3197508565880844, "grad_norm": 0.3712012767791748, "learning_rate": 7.955029108606392e-06, "loss": 0.968, "step": 1585 }, { "epoch": 0.31995259214429134, "grad_norm": 0.4860132932662964, "learning_rate": 7.952392509298916e-06, "loss": 0.9221, "step": 1586 }, { "epoch": 0.32015432770049834, "grad_norm": 0.41063064336776733, "learning_rate": 7.94975464896097e-06, "loss": 0.6869, "step": 1587 }, { "epoch": 0.32035606325670535, "grad_norm": 0.46989545226097107, "learning_rate": 7.947115528719241e-06, "loss": 0.7325, "step": 1588 }, { "epoch": 0.32055779881291235, "grad_norm": 0.613256573677063, "learning_rate": 7.944475149700954e-06, "loss": 0.6995, "step": 1589 }, { "epoch": 0.3207595343691193, "grad_norm": 0.7676635384559631, "learning_rate": 7.941833513033873e-06, "loss": 0.662, "step": 1590 }, { "epoch": 0.3209612699253263, "grad_norm": 0.3710860311985016, "learning_rate": 7.939190619846296e-06, "loss": 0.6619, "step": 1591 }, { "epoch": 0.3211630054815333, "grad_norm": 0.41502392292022705, "learning_rate": 7.93654647126706e-06, "loss": 0.6537, "step": 1592 }, { "epoch": 0.3213647410377403, "grad_norm": 0.5116865038871765, "learning_rate": 7.933901068425539e-06, "loss": 0.6757, "step": 1593 }, { "epoch": 0.3215664765939473, "grad_norm": 0.761396586894989, "learning_rate": 7.93125441245164e-06, "loss": 0.6889, "step": 1594 }, { "epoch": 0.3217682121501543, "grad_norm": 0.8247814178466797, "learning_rate": 7.928606504475809e-06, "loss": 0.7393, "step": 1595 }, { "epoch": 0.3219699477063613, "grad_norm": 0.5235460996627808, "learning_rate": 7.925957345629023e-06, "loss": 0.7118, "step": 1596 }, { "epoch": 0.3221716832625683, "grad_norm": 0.8428860902786255, "learning_rate": 7.923306937042796e-06, "loss": 0.6925, "step": 1597 }, { "epoch": 0.3223734188187753, "grad_norm": 0.7909860014915466, "learning_rate": 7.920655279849173e-06, "loss": 0.6794, "step": 1598 }, { "epoch": 0.32257515437498224, "grad_norm": 0.5924726724624634, "learning_rate": 7.918002375180733e-06, "loss": 0.6811, "step": 1599 }, { "epoch": 0.32277688993118925, "grad_norm": 0.8714628219604492, "learning_rate": 7.915348224170593e-06, "loss": 0.7976, "step": 1600 }, { "epoch": 0.32297862548739625, "grad_norm": 0.34327223896980286, "learning_rate": 7.912692827952395e-06, "loss": 0.7037, "step": 1601 }, { "epoch": 0.32318036104360326, "grad_norm": 0.3930150866508484, "learning_rate": 7.910036187660316e-06, "loss": 0.647, "step": 1602 }, { "epoch": 0.32338209659981026, "grad_norm": 0.39237427711486816, "learning_rate": 7.907378304429065e-06, "loss": 0.66, "step": 1603 }, { "epoch": 0.3235838321560172, "grad_norm": 0.7589460015296936, "learning_rate": 7.904719179393881e-06, "loss": 0.6372, "step": 1604 }, { "epoch": 0.3237855677122242, "grad_norm": 1.4176976680755615, "learning_rate": 7.902058813690532e-06, "loss": 0.6219, "step": 1605 }, { "epoch": 0.3239873032684312, "grad_norm": 0.6004815101623535, "learning_rate": 7.899397208455323e-06, "loss": 0.677, "step": 1606 }, { "epoch": 0.32418903882463823, "grad_norm": 1.2083632946014404, "learning_rate": 7.896734364825076e-06, "loss": 0.7007, "step": 1607 }, { "epoch": 0.3243907743808452, "grad_norm": 0.7155832648277283, "learning_rate": 7.894070283937152e-06, "loss": 0.6921, "step": 1608 }, { "epoch": 0.3245925099370522, "grad_norm": 0.3577544391155243, "learning_rate": 7.891404966929439e-06, "loss": 0.6632, "step": 1609 }, { "epoch": 0.3247942454932592, "grad_norm": 0.49218058586120605, "learning_rate": 7.888738414940352e-06, "loss": 0.6419, "step": 1610 }, { "epoch": 0.3249959810494662, "grad_norm": 0.4804253578186035, "learning_rate": 7.886070629108826e-06, "loss": 0.904, "step": 1611 }, { "epoch": 0.3251977166056732, "grad_norm": 0.36306464672088623, "learning_rate": 7.883401610574338e-06, "loss": 0.812, "step": 1612 }, { "epoch": 0.32539945216188015, "grad_norm": 0.41663479804992676, "learning_rate": 7.880731360476877e-06, "loss": 0.9999, "step": 1613 }, { "epoch": 0.32560118771808716, "grad_norm": 0.42178666591644287, "learning_rate": 7.878059879956967e-06, "loss": 0.8646, "step": 1614 }, { "epoch": 0.32580292327429416, "grad_norm": 0.4636440575122833, "learning_rate": 7.875387170155657e-06, "loss": 0.7127, "step": 1615 }, { "epoch": 0.32600465883050117, "grad_norm": 0.4064319431781769, "learning_rate": 7.872713232214517e-06, "loss": 0.8338, "step": 1616 }, { "epoch": 0.3262063943867082, "grad_norm": 0.47023361921310425, "learning_rate": 7.87003806727564e-06, "loss": 0.6782, "step": 1617 }, { "epoch": 0.3264081299429151, "grad_norm": 0.3830969035625458, "learning_rate": 7.867361676481654e-06, "loss": 0.6551, "step": 1618 }, { "epoch": 0.32660986549912213, "grad_norm": 0.44314447045326233, "learning_rate": 7.864684060975699e-06, "loss": 0.9076, "step": 1619 }, { "epoch": 0.32681160105532914, "grad_norm": 0.476024866104126, "learning_rate": 7.86200522190144e-06, "loss": 0.7747, "step": 1620 }, { "epoch": 0.32701333661153614, "grad_norm": 0.44015395641326904, "learning_rate": 7.859325160403073e-06, "loss": 0.6402, "step": 1621 }, { "epoch": 0.3272150721677431, "grad_norm": 0.5018024444580078, "learning_rate": 7.856643877625304e-06, "loss": 0.6621, "step": 1622 }, { "epoch": 0.3274168077239501, "grad_norm": 1.147547721862793, "learning_rate": 7.853961374713367e-06, "loss": 0.712, "step": 1623 }, { "epoch": 0.3276185432801571, "grad_norm": 0.7580029964447021, "learning_rate": 7.851277652813023e-06, "loss": 0.697, "step": 1624 }, { "epoch": 0.3278202788363641, "grad_norm": 1.268939733505249, "learning_rate": 7.848592713070542e-06, "loss": 0.6805, "step": 1625 }, { "epoch": 0.3280220143925711, "grad_norm": 0.46011170744895935, "learning_rate": 7.845906556632721e-06, "loss": 0.6714, "step": 1626 }, { "epoch": 0.32822374994877807, "grad_norm": 4.070643901824951, "learning_rate": 7.843219184646877e-06, "loss": 0.6859, "step": 1627 }, { "epoch": 0.32842548550498507, "grad_norm": 1.8737590312957764, "learning_rate": 7.84053059826084e-06, "loss": 0.6949, "step": 1628 }, { "epoch": 0.3286272210611921, "grad_norm": 0.5250210762023926, "learning_rate": 7.837840798622969e-06, "loss": 0.667, "step": 1629 }, { "epoch": 0.3288289566173991, "grad_norm": 0.40614551305770874, "learning_rate": 7.83514978688213e-06, "loss": 0.7115, "step": 1630 }, { "epoch": 0.32903069217360603, "grad_norm": 0.43287792801856995, "learning_rate": 7.832457564187715e-06, "loss": 0.7591, "step": 1631 }, { "epoch": 0.32923242772981304, "grad_norm": 0.598949134349823, "learning_rate": 7.82976413168963e-06, "loss": 0.7173, "step": 1632 }, { "epoch": 0.32943416328602004, "grad_norm": 1.1374804973602295, "learning_rate": 7.827069490538298e-06, "loss": 0.8068, "step": 1633 }, { "epoch": 0.32963589884222705, "grad_norm": 0.4957578182220459, "learning_rate": 7.82437364188466e-06, "loss": 0.6748, "step": 1634 }, { "epoch": 0.32983763439843405, "grad_norm": 0.5811397433280945, "learning_rate": 7.821676586880167e-06, "loss": 0.7066, "step": 1635 }, { "epoch": 0.330039369954641, "grad_norm": 0.3692084848880768, "learning_rate": 7.818978326676793e-06, "loss": 0.6955, "step": 1636 }, { "epoch": 0.330241105510848, "grad_norm": 0.40660807490348816, "learning_rate": 7.81627886242702e-06, "loss": 0.7185, "step": 1637 }, { "epoch": 0.330442841067055, "grad_norm": 0.31255900859832764, "learning_rate": 7.813578195283852e-06, "loss": 0.6788, "step": 1638 }, { "epoch": 0.330644576623262, "grad_norm": 0.2971351146697998, "learning_rate": 7.810876326400796e-06, "loss": 0.6293, "step": 1639 }, { "epoch": 0.33084631217946897, "grad_norm": 0.36985236406326294, "learning_rate": 7.808173256931883e-06, "loss": 0.8046, "step": 1640 }, { "epoch": 0.331048047735676, "grad_norm": 0.41634565591812134, "learning_rate": 7.805468988031652e-06, "loss": 0.7201, "step": 1641 }, { "epoch": 0.331249783291883, "grad_norm": 0.3758183717727661, "learning_rate": 7.802763520855152e-06, "loss": 0.7648, "step": 1642 }, { "epoch": 0.33145151884809, "grad_norm": 0.426986962556839, "learning_rate": 7.80005685655795e-06, "loss": 0.7397, "step": 1643 }, { "epoch": 0.331653254404297, "grad_norm": 1.3208926916122437, "learning_rate": 7.797348996296116e-06, "loss": 0.671, "step": 1644 }, { "epoch": 0.33185498996050394, "grad_norm": 0.3618689477443695, "learning_rate": 7.794639941226238e-06, "loss": 0.6987, "step": 1645 }, { "epoch": 0.33205672551671095, "grad_norm": 0.7262416481971741, "learning_rate": 7.791929692505411e-06, "loss": 0.6579, "step": 1646 }, { "epoch": 0.33225846107291795, "grad_norm": 0.7027097344398499, "learning_rate": 7.789218251291244e-06, "loss": 0.7859, "step": 1647 }, { "epoch": 0.33246019662912496, "grad_norm": 1.2748175859451294, "learning_rate": 7.786505618741848e-06, "loss": 0.6828, "step": 1648 }, { "epoch": 0.3326619321853319, "grad_norm": 0.4840089976787567, "learning_rate": 7.783791796015848e-06, "loss": 0.6823, "step": 1649 }, { "epoch": 0.3328636677415389, "grad_norm": 0.38921627402305603, "learning_rate": 7.781076784272377e-06, "loss": 0.6761, "step": 1650 }, { "epoch": 0.3330654032977459, "grad_norm": 0.30266091227531433, "learning_rate": 7.778360584671072e-06, "loss": 0.6835, "step": 1651 }, { "epoch": 0.3332671388539529, "grad_norm": 0.43497520685195923, "learning_rate": 7.775643198372085e-06, "loss": 0.7037, "step": 1652 }, { "epoch": 0.33346887441015993, "grad_norm": 0.42013806104660034, "learning_rate": 7.772924626536068e-06, "loss": 0.6906, "step": 1653 }, { "epoch": 0.3336706099663669, "grad_norm": 0.3756752014160156, "learning_rate": 7.770204870324181e-06, "loss": 0.7698, "step": 1654 }, { "epoch": 0.3338723455225739, "grad_norm": 0.5245344042778015, "learning_rate": 7.76748393089809e-06, "loss": 0.651, "step": 1655 }, { "epoch": 0.3340740810787809, "grad_norm": 0.9904852509498596, "learning_rate": 7.764761809419969e-06, "loss": 0.6821, "step": 1656 }, { "epoch": 0.3342758166349879, "grad_norm": 0.3961763083934784, "learning_rate": 7.762038507052494e-06, "loss": 0.6742, "step": 1657 }, { "epoch": 0.33447755219119485, "grad_norm": 0.38408246636390686, "learning_rate": 7.759314024958846e-06, "loss": 0.708, "step": 1658 }, { "epoch": 0.33467928774740185, "grad_norm": 0.8345887660980225, "learning_rate": 7.75658836430271e-06, "loss": 0.642, "step": 1659 }, { "epoch": 0.33488102330360886, "grad_norm": 0.5460199117660522, "learning_rate": 7.753861526248274e-06, "loss": 0.6872, "step": 1660 }, { "epoch": 0.33508275885981587, "grad_norm": 0.6602993607521057, "learning_rate": 7.751133511960228e-06, "loss": 0.7681, "step": 1661 }, { "epoch": 0.33528449441602287, "grad_norm": 1.3037577867507935, "learning_rate": 7.748404322603768e-06, "loss": 0.6629, "step": 1662 }, { "epoch": 0.3354862299722298, "grad_norm": 0.7454012036323547, "learning_rate": 7.74567395934459e-06, "loss": 0.7028, "step": 1663 }, { "epoch": 0.3356879655284368, "grad_norm": 0.7407185435295105, "learning_rate": 7.74294242334889e-06, "loss": 0.8085, "step": 1664 }, { "epoch": 0.33588970108464383, "grad_norm": 1.609656810760498, "learning_rate": 7.740209715783365e-06, "loss": 0.9629, "step": 1665 }, { "epoch": 0.33609143664085084, "grad_norm": 0.6212112307548523, "learning_rate": 7.737475837815215e-06, "loss": 0.6332, "step": 1666 }, { "epoch": 0.3362931721970578, "grad_norm": 0.6354268789291382, "learning_rate": 7.734740790612137e-06, "loss": 0.7686, "step": 1667 }, { "epoch": 0.3364949077532648, "grad_norm": 0.8016752600669861, "learning_rate": 7.732004575342328e-06, "loss": 0.6746, "step": 1668 }, { "epoch": 0.3366966433094718, "grad_norm": 1.6977438926696777, "learning_rate": 7.729267193174483e-06, "loss": 0.6971, "step": 1669 }, { "epoch": 0.3368983788656788, "grad_norm": 0.3582017421722412, "learning_rate": 7.726528645277801e-06, "loss": 0.7068, "step": 1670 }, { "epoch": 0.3371001144218858, "grad_norm": 1.4036459922790527, "learning_rate": 7.723788932821977e-06, "loss": 0.7213, "step": 1671 }, { "epoch": 0.33730184997809276, "grad_norm": 0.34059593081474304, "learning_rate": 7.721048056977192e-06, "loss": 0.644, "step": 1672 }, { "epoch": 0.33750358553429977, "grad_norm": 6.393365383148193, "learning_rate": 7.71830601891414e-06, "loss": 0.671, "step": 1673 }, { "epoch": 0.33770532109050677, "grad_norm": 0.9649179577827454, "learning_rate": 7.715562819804005e-06, "loss": 0.6975, "step": 1674 }, { "epoch": 0.3379070566467138, "grad_norm": 1.1787409782409668, "learning_rate": 7.712818460818464e-06, "loss": 0.8055, "step": 1675 }, { "epoch": 0.3381087922029207, "grad_norm": 0.6837486028671265, "learning_rate": 7.710072943129692e-06, "loss": 0.8101, "step": 1676 }, { "epoch": 0.33831052775912773, "grad_norm": 0.3825688660144806, "learning_rate": 7.707326267910358e-06, "loss": 0.8263, "step": 1677 }, { "epoch": 0.33851226331533474, "grad_norm": 0.3459135591983795, "learning_rate": 7.70457843633363e-06, "loss": 0.6956, "step": 1678 }, { "epoch": 0.33871399887154174, "grad_norm": 0.42721888422966003, "learning_rate": 7.70182944957316e-06, "loss": 0.728, "step": 1679 }, { "epoch": 0.33891573442774875, "grad_norm": 0.41401034593582153, "learning_rate": 7.699079308803105e-06, "loss": 0.7388, "step": 1680 }, { "epoch": 0.3391174699839557, "grad_norm": 0.4155450463294983, "learning_rate": 7.696328015198107e-06, "loss": 0.6918, "step": 1681 }, { "epoch": 0.3393192055401627, "grad_norm": 0.9530971646308899, "learning_rate": 7.693575569933302e-06, "loss": 0.6704, "step": 1682 }, { "epoch": 0.3395209410963697, "grad_norm": 0.46446821093559265, "learning_rate": 7.69082197418432e-06, "loss": 0.7255, "step": 1683 }, { "epoch": 0.3397226766525767, "grad_norm": 0.8986880779266357, "learning_rate": 7.688067229127283e-06, "loss": 0.6523, "step": 1684 }, { "epoch": 0.33992441220878367, "grad_norm": 0.5720349550247192, "learning_rate": 7.685311335938797e-06, "loss": 0.761, "step": 1685 }, { "epoch": 0.34012614776499067, "grad_norm": 0.4581213593482971, "learning_rate": 7.682554295795968e-06, "loss": 0.7814, "step": 1686 }, { "epoch": 0.3403278833211977, "grad_norm": 0.40103885531425476, "learning_rate": 7.679796109876385e-06, "loss": 0.6647, "step": 1687 }, { "epoch": 0.3405296188774047, "grad_norm": 0.388505756855011, "learning_rate": 7.67703677935813e-06, "loss": 0.6565, "step": 1688 }, { "epoch": 0.3407313544336117, "grad_norm": 0.36097803711891174, "learning_rate": 7.67427630541977e-06, "loss": 0.7654, "step": 1689 }, { "epoch": 0.34093308998981864, "grad_norm": 0.3629458546638489, "learning_rate": 7.671514689240366e-06, "loss": 0.6841, "step": 1690 }, { "epoch": 0.34113482554602564, "grad_norm": 0.6155198216438293, "learning_rate": 7.668751931999464e-06, "loss": 0.7549, "step": 1691 }, { "epoch": 0.34133656110223265, "grad_norm": 0.4708308279514313, "learning_rate": 7.665988034877093e-06, "loss": 0.6257, "step": 1692 }, { "epoch": 0.34153829665843966, "grad_norm": 0.59466552734375, "learning_rate": 7.663222999053774e-06, "loss": 0.7247, "step": 1693 }, { "epoch": 0.3417400322146466, "grad_norm": 0.4716358482837677, "learning_rate": 7.660456825710518e-06, "loss": 0.6991, "step": 1694 }, { "epoch": 0.3419417677708536, "grad_norm": 0.808247983455658, "learning_rate": 7.657689516028814e-06, "loss": 0.6722, "step": 1695 }, { "epoch": 0.3421435033270606, "grad_norm": 0.7132179141044617, "learning_rate": 7.654921071190637e-06, "loss": 0.7261, "step": 1696 }, { "epoch": 0.3423452388832676, "grad_norm": 0.4227313995361328, "learning_rate": 7.652151492378455e-06, "loss": 0.6658, "step": 1697 }, { "epoch": 0.3425469744394746, "grad_norm": 0.32644885778427124, "learning_rate": 7.649380780775211e-06, "loss": 0.6655, "step": 1698 }, { "epoch": 0.3427487099956816, "grad_norm": 1.1053694486618042, "learning_rate": 7.646608937564338e-06, "loss": 0.6884, "step": 1699 }, { "epoch": 0.3429504455518886, "grad_norm": 0.3581966757774353, "learning_rate": 7.643835963929747e-06, "loss": 0.6848, "step": 1700 }, { "epoch": 0.3431521811080956, "grad_norm": 0.5573354959487915, "learning_rate": 7.641061861055837e-06, "loss": 0.802, "step": 1701 }, { "epoch": 0.3433539166643026, "grad_norm": 0.8281745910644531, "learning_rate": 7.638286630127487e-06, "loss": 0.6741, "step": 1702 }, { "epoch": 0.34355565222050954, "grad_norm": 0.5986734628677368, "learning_rate": 7.635510272330058e-06, "loss": 0.6812, "step": 1703 }, { "epoch": 0.34375738777671655, "grad_norm": 0.587714672088623, "learning_rate": 7.63273278884939e-06, "loss": 0.7008, "step": 1704 }, { "epoch": 0.34395912333292356, "grad_norm": 0.6945699453353882, "learning_rate": 7.62995418087181e-06, "loss": 0.6674, "step": 1705 }, { "epoch": 0.34416085888913056, "grad_norm": 2.0080175399780273, "learning_rate": 7.6271744495841185e-06, "loss": 0.677, "step": 1706 }, { "epoch": 0.34436259444533757, "grad_norm": 1.2509560585021973, "learning_rate": 7.624393596173598e-06, "loss": 0.682, "step": 1707 }, { "epoch": 0.3445643300015445, "grad_norm": 0.977975606918335, "learning_rate": 7.621611621828016e-06, "loss": 0.7995, "step": 1708 }, { "epoch": 0.3447660655577515, "grad_norm": 0.7222344875335693, "learning_rate": 7.618828527735607e-06, "loss": 0.8359, "step": 1709 }, { "epoch": 0.3449678011139585, "grad_norm": 0.7610874772071838, "learning_rate": 7.616044315085092e-06, "loss": 0.6729, "step": 1710 }, { "epoch": 0.34516953667016553, "grad_norm": 0.5035162568092346, "learning_rate": 7.613258985065672e-06, "loss": 0.7004, "step": 1711 }, { "epoch": 0.34537127222637254, "grad_norm": 0.31400299072265625, "learning_rate": 7.61047253886702e-06, "loss": 0.6807, "step": 1712 }, { "epoch": 0.3455730077825795, "grad_norm": 0.41363200545310974, "learning_rate": 7.607684977679284e-06, "loss": 0.7384, "step": 1713 }, { "epoch": 0.3457747433387865, "grad_norm": 0.42264917492866516, "learning_rate": 7.604896302693094e-06, "loss": 0.679, "step": 1714 }, { "epoch": 0.3459764788949935, "grad_norm": 0.8459030985832214, "learning_rate": 7.602106515099554e-06, "loss": 0.6999, "step": 1715 }, { "epoch": 0.3461782144512005, "grad_norm": 0.5736342668533325, "learning_rate": 7.599315616090242e-06, "loss": 0.7783, "step": 1716 }, { "epoch": 0.34637995000740746, "grad_norm": 0.42105481028556824, "learning_rate": 7.596523606857209e-06, "loss": 0.6519, "step": 1717 }, { "epoch": 0.34658168556361446, "grad_norm": 0.40403833985328674, "learning_rate": 7.593730488592985e-06, "loss": 0.7062, "step": 1718 }, { "epoch": 0.34678342111982147, "grad_norm": 0.51008141040802, "learning_rate": 7.590936262490569e-06, "loss": 0.7837, "step": 1719 }, { "epoch": 0.34698515667602847, "grad_norm": 0.680470883846283, "learning_rate": 7.588140929743437e-06, "loss": 0.6716, "step": 1720 }, { "epoch": 0.3471868922322355, "grad_norm": 0.3377155661582947, "learning_rate": 7.585344491545535e-06, "loss": 0.7046, "step": 1721 }, { "epoch": 0.34738862778844243, "grad_norm": 0.3631691336631775, "learning_rate": 7.58254694909128e-06, "loss": 0.6441, "step": 1722 }, { "epoch": 0.34759036334464943, "grad_norm": 0.4910680055618286, "learning_rate": 7.579748303575567e-06, "loss": 0.6494, "step": 1723 }, { "epoch": 0.34779209890085644, "grad_norm": 0.33177176117897034, "learning_rate": 7.576948556193755e-06, "loss": 0.6973, "step": 1724 }, { "epoch": 0.34799383445706344, "grad_norm": 0.4534423351287842, "learning_rate": 7.574147708141675e-06, "loss": 0.6672, "step": 1725 }, { "epoch": 0.3481955700132704, "grad_norm": 0.7091371417045593, "learning_rate": 7.5713457606156335e-06, "loss": 0.6313, "step": 1726 }, { "epoch": 0.3483973055694774, "grad_norm": 0.38050803542137146, "learning_rate": 7.568542714812401e-06, "loss": 0.7431, "step": 1727 }, { "epoch": 0.3485990411256844, "grad_norm": 0.7393271327018738, "learning_rate": 7.565738571929217e-06, "loss": 0.7006, "step": 1728 }, { "epoch": 0.3488007766818914, "grad_norm": 0.35073035955429077, "learning_rate": 7.562933333163792e-06, "loss": 0.6593, "step": 1729 }, { "epoch": 0.3490025122380984, "grad_norm": 0.9442110657691956, "learning_rate": 7.5601269997143055e-06, "loss": 0.6577, "step": 1730 }, { "epoch": 0.34920424779430537, "grad_norm": 1.0707677602767944, "learning_rate": 7.557319572779402e-06, "loss": 0.6477, "step": 1731 }, { "epoch": 0.3494059833505124, "grad_norm": 0.3912695050239563, "learning_rate": 7.554511053558196e-06, "loss": 1.0967, "step": 1732 }, { "epoch": 0.3496077189067194, "grad_norm": 0.7106683850288391, "learning_rate": 7.551701443250263e-06, "loss": 0.8774, "step": 1733 }, { "epoch": 0.3498094544629264, "grad_norm": 0.7584543824195862, "learning_rate": 7.54889074305565e-06, "loss": 0.7291, "step": 1734 }, { "epoch": 0.35001119001913333, "grad_norm": 0.3926626145839691, "learning_rate": 7.546078954174868e-06, "loss": 0.718, "step": 1735 }, { "epoch": 0.35021292557534034, "grad_norm": 1.3000050783157349, "learning_rate": 7.543266077808893e-06, "loss": 0.7198, "step": 1736 }, { "epoch": 0.35041466113154734, "grad_norm": 0.4094867706298828, "learning_rate": 7.540452115159163e-06, "loss": 0.7017, "step": 1737 }, { "epoch": 0.35061639668775435, "grad_norm": 0.3614434003829956, "learning_rate": 7.5376370674275834e-06, "loss": 0.6872, "step": 1738 }, { "epoch": 0.35081813224396136, "grad_norm": 1.8085640668869019, "learning_rate": 7.5348209358165225e-06, "loss": 0.7319, "step": 1739 }, { "epoch": 0.3510198678001683, "grad_norm": 0.7284565567970276, "learning_rate": 7.53200372152881e-06, "loss": 0.6596, "step": 1740 }, { "epoch": 0.3512216033563753, "grad_norm": 0.4147509038448334, "learning_rate": 7.529185425767738e-06, "loss": 0.7772, "step": 1741 }, { "epoch": 0.3514233389125823, "grad_norm": 0.35822349786758423, "learning_rate": 7.526366049737063e-06, "loss": 0.7323, "step": 1742 }, { "epoch": 0.3516250744687893, "grad_norm": 0.38672930002212524, "learning_rate": 7.523545594641001e-06, "loss": 0.8405, "step": 1743 }, { "epoch": 0.3518268100249963, "grad_norm": 0.3860418498516083, "learning_rate": 7.520724061684227e-06, "loss": 0.6463, "step": 1744 }, { "epoch": 0.3520285455812033, "grad_norm": 0.5697163343429565, "learning_rate": 7.51790145207188e-06, "loss": 0.701, "step": 1745 }, { "epoch": 0.3522302811374103, "grad_norm": 0.6017259955406189, "learning_rate": 7.51507776700956e-06, "loss": 0.6839, "step": 1746 }, { "epoch": 0.3524320166936173, "grad_norm": 0.46007320284843445, "learning_rate": 7.512253007703321e-06, "loss": 0.7561, "step": 1747 }, { "epoch": 0.3526337522498243, "grad_norm": 0.740803062915802, "learning_rate": 7.509427175359678e-06, "loss": 0.6879, "step": 1748 }, { "epoch": 0.35283548780603124, "grad_norm": 0.4627668857574463, "learning_rate": 7.506600271185605e-06, "loss": 0.6998, "step": 1749 }, { "epoch": 0.35303722336223825, "grad_norm": 0.3997352123260498, "learning_rate": 7.503772296388536e-06, "loss": 0.6721, "step": 1750 }, { "epoch": 0.35323895891844526, "grad_norm": 0.49099141359329224, "learning_rate": 7.500943252176359e-06, "loss": 0.7366, "step": 1751 }, { "epoch": 0.35344069447465226, "grad_norm": 0.3792705833911896, "learning_rate": 7.498113139757418e-06, "loss": 0.6561, "step": 1752 }, { "epoch": 0.3536424300308592, "grad_norm": 0.5688077211380005, "learning_rate": 7.4952819603405155e-06, "loss": 0.6516, "step": 1753 }, { "epoch": 0.3538441655870662, "grad_norm": 0.5022661089897156, "learning_rate": 7.492449715134912e-06, "loss": 0.7017, "step": 1754 }, { "epoch": 0.3540459011432732, "grad_norm": 0.4993700683116913, "learning_rate": 7.489616405350319e-06, "loss": 0.6686, "step": 1755 }, { "epoch": 0.35424763669948023, "grad_norm": 1.4127341508865356, "learning_rate": 7.4867820321969005e-06, "loss": 0.8506, "step": 1756 }, { "epoch": 0.35444937225568723, "grad_norm": 0.5744199752807617, "learning_rate": 7.483946596885283e-06, "loss": 0.6632, "step": 1757 }, { "epoch": 0.3546511078118942, "grad_norm": 0.5674511194229126, "learning_rate": 7.481110100626542e-06, "loss": 0.6733, "step": 1758 }, { "epoch": 0.3548528433681012, "grad_norm": 0.45668545365333557, "learning_rate": 7.478272544632204e-06, "loss": 0.6698, "step": 1759 }, { "epoch": 0.3550545789243082, "grad_norm": 0.4792921841144562, "learning_rate": 7.47543393011425e-06, "loss": 0.6587, "step": 1760 }, { "epoch": 0.3552563144805152, "grad_norm": 0.6189489960670471, "learning_rate": 7.472594258285115e-06, "loss": 1.0455, "step": 1761 }, { "epoch": 0.35545805003672215, "grad_norm": 0.5455827116966248, "learning_rate": 7.469753530357684e-06, "loss": 0.7175, "step": 1762 }, { "epoch": 0.35565978559292916, "grad_norm": 0.7381955981254578, "learning_rate": 7.466911747545291e-06, "loss": 0.6858, "step": 1763 }, { "epoch": 0.35586152114913616, "grad_norm": 1.1056345701217651, "learning_rate": 7.464068911061726e-06, "loss": 0.6447, "step": 1764 }, { "epoch": 0.35606325670534317, "grad_norm": 2.1146862506866455, "learning_rate": 7.461225022121223e-06, "loss": 0.6942, "step": 1765 }, { "epoch": 0.3562649922615502, "grad_norm": 1.0887856483459473, "learning_rate": 7.45838008193847e-06, "loss": 0.7291, "step": 1766 }, { "epoch": 0.3564667278177571, "grad_norm": 0.6987454891204834, "learning_rate": 7.455534091728603e-06, "loss": 0.6633, "step": 1767 }, { "epoch": 0.35666846337396413, "grad_norm": 0.9622281789779663, "learning_rate": 7.452687052707201e-06, "loss": 0.7049, "step": 1768 }, { "epoch": 0.35687019893017113, "grad_norm": 0.8182410597801208, "learning_rate": 7.4498389660903025e-06, "loss": 1.1296, "step": 1769 }, { "epoch": 0.35707193448637814, "grad_norm": 1.42489755153656, "learning_rate": 7.446989833094381e-06, "loss": 0.7751, "step": 1770 }, { "epoch": 0.3572736700425851, "grad_norm": 0.5022891759872437, "learning_rate": 7.444139654936367e-06, "loss": 0.6574, "step": 1771 }, { "epoch": 0.3574754055987921, "grad_norm": 0.913811445236206, "learning_rate": 7.441288432833628e-06, "loss": 0.7729, "step": 1772 }, { "epoch": 0.3576771411549991, "grad_norm": 0.987787127494812, "learning_rate": 7.438436168003987e-06, "loss": 0.6526, "step": 1773 }, { "epoch": 0.3578788767112061, "grad_norm": 0.8746455311775208, "learning_rate": 7.435582861665705e-06, "loss": 0.678, "step": 1774 }, { "epoch": 0.3580806122674131, "grad_norm": 0.4354274868965149, "learning_rate": 7.432728515037494e-06, "loss": 0.7056, "step": 1775 }, { "epoch": 0.35828234782362006, "grad_norm": 0.3449755311012268, "learning_rate": 7.429873129338503e-06, "loss": 0.814, "step": 1776 }, { "epoch": 0.35848408337982707, "grad_norm": 0.741226851940155, "learning_rate": 7.4270167057883295e-06, "loss": 0.7221, "step": 1777 }, { "epoch": 0.3586858189360341, "grad_norm": 1.018054723739624, "learning_rate": 7.424159245607016e-06, "loss": 0.816, "step": 1778 }, { "epoch": 0.3588875544922411, "grad_norm": 0.4770512282848358, "learning_rate": 7.421300750015043e-06, "loss": 0.8044, "step": 1779 }, { "epoch": 0.35908929004844803, "grad_norm": 0.7181556820869446, "learning_rate": 7.418441220233336e-06, "loss": 0.6637, "step": 1780 }, { "epoch": 0.35929102560465503, "grad_norm": 0.49955129623413086, "learning_rate": 7.415580657483263e-06, "loss": 0.6818, "step": 1781 }, { "epoch": 0.35949276116086204, "grad_norm": 0.6910608410835266, "learning_rate": 7.412719062986632e-06, "loss": 0.7449, "step": 1782 }, { "epoch": 0.35969449671706905, "grad_norm": 0.35166415572166443, "learning_rate": 7.40985643796569e-06, "loss": 0.6331, "step": 1783 }, { "epoch": 0.35989623227327605, "grad_norm": 0.5472837090492249, "learning_rate": 7.406992783643127e-06, "loss": 0.6728, "step": 1784 }, { "epoch": 0.360097967829483, "grad_norm": 0.5765179395675659, "learning_rate": 7.4041281012420695e-06, "loss": 0.652, "step": 1785 }, { "epoch": 0.36029970338569, "grad_norm": 0.642083466053009, "learning_rate": 7.401262391986088e-06, "loss": 0.7795, "step": 1786 }, { "epoch": 0.360501438941897, "grad_norm": 0.39941856265068054, "learning_rate": 7.398395657099189e-06, "loss": 0.662, "step": 1787 }, { "epoch": 0.360703174498104, "grad_norm": 0.4457746148109436, "learning_rate": 7.395527897805812e-06, "loss": 0.6827, "step": 1788 }, { "epoch": 0.36090491005431097, "grad_norm": 0.44422203302383423, "learning_rate": 7.392659115330844e-06, "loss": 0.7186, "step": 1789 }, { "epoch": 0.361106645610518, "grad_norm": 0.4708141088485718, "learning_rate": 7.389789310899602e-06, "loss": 0.6707, "step": 1790 }, { "epoch": 0.361308381166725, "grad_norm": 0.7885355353355408, "learning_rate": 7.38691848573784e-06, "loss": 0.651, "step": 1791 }, { "epoch": 0.361510116722932, "grad_norm": 0.7183753848075867, "learning_rate": 7.3840466410717505e-06, "loss": 0.6756, "step": 1792 }, { "epoch": 0.361711852279139, "grad_norm": 0.8749204277992249, "learning_rate": 7.381173778127961e-06, "loss": 0.6698, "step": 1793 }, { "epoch": 0.36191358783534594, "grad_norm": 0.4229263961315155, "learning_rate": 7.378299898133533e-06, "loss": 0.6847, "step": 1794 }, { "epoch": 0.36211532339155295, "grad_norm": 1.5481194257736206, "learning_rate": 7.3754250023159615e-06, "loss": 0.9223, "step": 1795 }, { "epoch": 0.36231705894775995, "grad_norm": 0.6774659752845764, "learning_rate": 7.372549091903175e-06, "loss": 0.6662, "step": 1796 }, { "epoch": 0.36251879450396696, "grad_norm": 0.4385301470756531, "learning_rate": 7.36967216812354e-06, "loss": 0.7054, "step": 1797 }, { "epoch": 0.3627205300601739, "grad_norm": 0.7024504542350769, "learning_rate": 7.366794232205852e-06, "loss": 0.7764, "step": 1798 }, { "epoch": 0.3629222656163809, "grad_norm": 0.4045291543006897, "learning_rate": 7.36391528537934e-06, "loss": 0.6346, "step": 1799 }, { "epoch": 0.3631240011725879, "grad_norm": 0.39111268520355225, "learning_rate": 7.36103532887366e-06, "loss": 0.7232, "step": 1800 }, { "epoch": 0.3633257367287949, "grad_norm": 0.44023269414901733, "learning_rate": 7.358154363918909e-06, "loss": 0.6935, "step": 1801 }, { "epoch": 0.36352747228500193, "grad_norm": 0.3415777087211609, "learning_rate": 7.355272391745605e-06, "loss": 0.6837, "step": 1802 }, { "epoch": 0.3637292078412089, "grad_norm": 0.32775235176086426, "learning_rate": 7.352389413584704e-06, "loss": 0.7245, "step": 1803 }, { "epoch": 0.3639309433974159, "grad_norm": 0.4504493176937103, "learning_rate": 7.349505430667585e-06, "loss": 0.6969, "step": 1804 }, { "epoch": 0.3641326789536229, "grad_norm": 0.5969768762588501, "learning_rate": 7.3466204442260605e-06, "loss": 0.6916, "step": 1805 }, { "epoch": 0.3643344145098299, "grad_norm": 0.5859495401382446, "learning_rate": 7.343734455492372e-06, "loss": 0.7688, "step": 1806 }, { "epoch": 0.36453615006603685, "grad_norm": 0.431937038898468, "learning_rate": 7.340847465699186e-06, "loss": 0.6623, "step": 1807 }, { "epoch": 0.36473788562224385, "grad_norm": 0.4382317364215851, "learning_rate": 7.3379594760795955e-06, "loss": 0.684, "step": 1808 }, { "epoch": 0.36493962117845086, "grad_norm": 0.33924439549446106, "learning_rate": 7.335070487867127e-06, "loss": 0.6919, "step": 1809 }, { "epoch": 0.36514135673465786, "grad_norm": 0.8796870112419128, "learning_rate": 7.332180502295729e-06, "loss": 0.7005, "step": 1810 }, { "epoch": 0.36534309229086487, "grad_norm": 1.7464920282363892, "learning_rate": 7.329289520599776e-06, "loss": 0.6226, "step": 1811 }, { "epoch": 0.3655448278470718, "grad_norm": 0.5517187714576721, "learning_rate": 7.326397544014065e-06, "loss": 0.6956, "step": 1812 }, { "epoch": 0.3657465634032788, "grad_norm": 0.4252607226371765, "learning_rate": 7.32350457377383e-06, "loss": 0.687, "step": 1813 }, { "epoch": 0.36594829895948583, "grad_norm": 0.49049055576324463, "learning_rate": 7.320610611114713e-06, "loss": 0.7759, "step": 1814 }, { "epoch": 0.36615003451569283, "grad_norm": 0.3037787079811096, "learning_rate": 7.317715657272793e-06, "loss": 0.7135, "step": 1815 }, { "epoch": 0.36635177007189984, "grad_norm": 2.116178274154663, "learning_rate": 7.314819713484561e-06, "loss": 0.6389, "step": 1816 }, { "epoch": 0.3665535056281068, "grad_norm": 0.47844094038009644, "learning_rate": 7.3119227809869445e-06, "loss": 0.7052, "step": 1817 }, { "epoch": 0.3667552411843138, "grad_norm": 0.5032902359962463, "learning_rate": 7.309024861017281e-06, "loss": 0.6915, "step": 1818 }, { "epoch": 0.3669569767405208, "grad_norm": 2.9803664684295654, "learning_rate": 7.306125954813335e-06, "loss": 0.6944, "step": 1819 }, { "epoch": 0.3671587122967278, "grad_norm": 1.1328582763671875, "learning_rate": 7.303226063613293e-06, "loss": 0.7092, "step": 1820 }, { "epoch": 0.36736044785293476, "grad_norm": 1.1794075965881348, "learning_rate": 7.300325188655762e-06, "loss": 0.7334, "step": 1821 }, { "epoch": 0.36756218340914176, "grad_norm": 0.4251624643802643, "learning_rate": 7.297423331179766e-06, "loss": 0.6757, "step": 1822 }, { "epoch": 0.36776391896534877, "grad_norm": 0.35774368047714233, "learning_rate": 7.294520492424752e-06, "loss": 0.701, "step": 1823 }, { "epoch": 0.3679656545215558, "grad_norm": 0.34498193860054016, "learning_rate": 7.291616673630583e-06, "loss": 0.8526, "step": 1824 }, { "epoch": 0.3681673900777628, "grad_norm": 0.5169105529785156, "learning_rate": 7.288711876037546e-06, "loss": 0.7012, "step": 1825 }, { "epoch": 0.36836912563396973, "grad_norm": 0.7580662369728088, "learning_rate": 7.28580610088634e-06, "loss": 0.7087, "step": 1826 }, { "epoch": 0.36857086119017674, "grad_norm": 0.5409523844718933, "learning_rate": 7.282899349418086e-06, "loss": 0.6959, "step": 1827 }, { "epoch": 0.36877259674638374, "grad_norm": 0.44074133038520813, "learning_rate": 7.279991622874319e-06, "loss": 0.641, "step": 1828 }, { "epoch": 0.36897433230259075, "grad_norm": 0.45611658692359924, "learning_rate": 7.277082922496993e-06, "loss": 0.8013, "step": 1829 }, { "epoch": 0.3691760678587977, "grad_norm": 2.360243558883667, "learning_rate": 7.2741732495284745e-06, "loss": 0.675, "step": 1830 }, { "epoch": 0.3693778034150047, "grad_norm": 0.5340917110443115, "learning_rate": 7.27126260521155e-06, "loss": 0.7439, "step": 1831 }, { "epoch": 0.3695795389712117, "grad_norm": 0.8571584224700928, "learning_rate": 7.268350990789415e-06, "loss": 0.737, "step": 1832 }, { "epoch": 0.3697812745274187, "grad_norm": 1.4758659601211548, "learning_rate": 7.265438407505686e-06, "loss": 0.852, "step": 1833 }, { "epoch": 0.3699830100836257, "grad_norm": 0.7456501722335815, "learning_rate": 7.262524856604389e-06, "loss": 0.6932, "step": 1834 }, { "epoch": 0.37018474563983267, "grad_norm": 0.36184829473495483, "learning_rate": 7.259610339329965e-06, "loss": 0.651, "step": 1835 }, { "epoch": 0.3703864811960397, "grad_norm": 0.3770706057548523, "learning_rate": 7.256694856927267e-06, "loss": 0.7797, "step": 1836 }, { "epoch": 0.3705882167522467, "grad_norm": 0.4544236361980438, "learning_rate": 7.253778410641557e-06, "loss": 0.6685, "step": 1837 }, { "epoch": 0.3707899523084537, "grad_norm": 0.3603919446468353, "learning_rate": 7.2508610017185175e-06, "loss": 0.8271, "step": 1838 }, { "epoch": 0.37099168786466064, "grad_norm": 0.5313817858695984, "learning_rate": 7.247942631404232e-06, "loss": 0.6747, "step": 1839 }, { "epoch": 0.37119342342086764, "grad_norm": 0.3739321231842041, "learning_rate": 7.245023300945203e-06, "loss": 0.7191, "step": 1840 }, { "epoch": 0.37139515897707465, "grad_norm": 0.3855895400047302, "learning_rate": 7.242103011588339e-06, "loss": 0.6782, "step": 1841 }, { "epoch": 0.37159689453328165, "grad_norm": 0.4556311070919037, "learning_rate": 7.239181764580956e-06, "loss": 0.6905, "step": 1842 }, { "epoch": 0.37179863008948866, "grad_norm": 0.4981215000152588, "learning_rate": 7.236259561170783e-06, "loss": 0.6933, "step": 1843 }, { "epoch": 0.3720003656456956, "grad_norm": 0.47854724526405334, "learning_rate": 7.233336402605956e-06, "loss": 0.8892, "step": 1844 }, { "epoch": 0.3722021012019026, "grad_norm": 0.6122666597366333, "learning_rate": 7.23041229013502e-06, "loss": 0.668, "step": 1845 }, { "epoch": 0.3724038367581096, "grad_norm": 0.5111199021339417, "learning_rate": 7.227487225006926e-06, "loss": 0.8229, "step": 1846 }, { "epoch": 0.3726055723143166, "grad_norm": 0.6023356914520264, "learning_rate": 7.22456120847103e-06, "loss": 1.0255, "step": 1847 }, { "epoch": 0.3728073078705236, "grad_norm": 0.4147641062736511, "learning_rate": 7.2216342417771e-06, "loss": 0.6807, "step": 1848 }, { "epoch": 0.3730090434267306, "grad_norm": 0.9486438632011414, "learning_rate": 7.218706326175304e-06, "loss": 0.6871, "step": 1849 }, { "epoch": 0.3732107789829376, "grad_norm": 0.8997410535812378, "learning_rate": 7.215777462916221e-06, "loss": 0.8113, "step": 1850 }, { "epoch": 0.3734125145391446, "grad_norm": 1.2337889671325684, "learning_rate": 7.212847653250828e-06, "loss": 0.7935, "step": 1851 }, { "epoch": 0.3736142500953516, "grad_norm": 0.511205792427063, "learning_rate": 7.2099168984305124e-06, "loss": 0.7142, "step": 1852 }, { "epoch": 0.37381598565155855, "grad_norm": 3.531510353088379, "learning_rate": 7.206985199707062e-06, "loss": 0.6798, "step": 1853 }, { "epoch": 0.37401772120776555, "grad_norm": 1.0379084348678589, "learning_rate": 7.204052558332668e-06, "loss": 0.7005, "step": 1854 }, { "epoch": 0.37421945676397256, "grad_norm": 2.9178311824798584, "learning_rate": 7.2011189755599255e-06, "loss": 0.6898, "step": 1855 }, { "epoch": 0.37442119232017956, "grad_norm": 2.786430597305298, "learning_rate": 7.19818445264183e-06, "loss": 0.6328, "step": 1856 }, { "epoch": 0.3746229278763865, "grad_norm": 3.562833070755005, "learning_rate": 7.19524899083178e-06, "loss": 0.6735, "step": 1857 }, { "epoch": 0.3748246634325935, "grad_norm": 1.2733399868011475, "learning_rate": 7.192312591383575e-06, "loss": 0.6778, "step": 1858 }, { "epoch": 0.3750263989888005, "grad_norm": 0.535793662071228, "learning_rate": 7.189375255551413e-06, "loss": 0.8744, "step": 1859 }, { "epoch": 0.37522813454500753, "grad_norm": 1.285027265548706, "learning_rate": 7.186436984589895e-06, "loss": 0.9023, "step": 1860 }, { "epoch": 0.37542987010121454, "grad_norm": 0.45449426770210266, "learning_rate": 7.18349777975402e-06, "loss": 0.9073, "step": 1861 }, { "epoch": 0.3756316056574215, "grad_norm": 0.36496907472610474, "learning_rate": 7.180557642299184e-06, "loss": 0.7054, "step": 1862 }, { "epoch": 0.3758333412136285, "grad_norm": 0.6334181427955627, "learning_rate": 7.177616573481185e-06, "loss": 0.7651, "step": 1863 }, { "epoch": 0.3760350767698355, "grad_norm": 0.410114586353302, "learning_rate": 7.1746745745562165e-06, "loss": 0.6655, "step": 1864 }, { "epoch": 0.3762368123260425, "grad_norm": 0.35170572996139526, "learning_rate": 7.171731646780867e-06, "loss": 0.692, "step": 1865 }, { "epoch": 0.37643854788224945, "grad_norm": 0.4408973455429077, "learning_rate": 7.168787791412128e-06, "loss": 0.6569, "step": 1866 }, { "epoch": 0.37664028343845646, "grad_norm": 0.3591359257698059, "learning_rate": 7.165843009707383e-06, "loss": 0.6774, "step": 1867 }, { "epoch": 0.37684201899466346, "grad_norm": 0.48465201258659363, "learning_rate": 7.162897302924409e-06, "loss": 0.9298, "step": 1868 }, { "epoch": 0.37704375455087047, "grad_norm": 0.4542745351791382, "learning_rate": 7.1599506723213845e-06, "loss": 0.6479, "step": 1869 }, { "epoch": 0.3772454901070775, "grad_norm": 0.4889806807041168, "learning_rate": 7.157003119156876e-06, "loss": 0.8561, "step": 1870 }, { "epoch": 0.3774472256632844, "grad_norm": 0.40656688809394836, "learning_rate": 7.154054644689847e-06, "loss": 0.7979, "step": 1871 }, { "epoch": 0.37764896121949143, "grad_norm": 0.419414758682251, "learning_rate": 7.151105250179658e-06, "loss": 0.6655, "step": 1872 }, { "epoch": 0.37785069677569844, "grad_norm": 0.4601416289806366, "learning_rate": 7.1481549368860545e-06, "loss": 0.7036, "step": 1873 }, { "epoch": 0.37805243233190544, "grad_norm": 0.5763274431228638, "learning_rate": 7.145203706069183e-06, "loss": 0.6789, "step": 1874 }, { "epoch": 0.3782541678881124, "grad_norm": 0.42279934883117676, "learning_rate": 7.142251558989573e-06, "loss": 0.6988, "step": 1875 }, { "epoch": 0.3784559034443194, "grad_norm": 0.3470363914966583, "learning_rate": 7.139298496908155e-06, "loss": 0.8392, "step": 1876 }, { "epoch": 0.3786576390005264, "grad_norm": 0.34533339738845825, "learning_rate": 7.136344521086242e-06, "loss": 0.6961, "step": 1877 }, { "epoch": 0.3788593745567334, "grad_norm": 0.4589783847332001, "learning_rate": 7.133389632785543e-06, "loss": 0.868, "step": 1878 }, { "epoch": 0.3790611101129404, "grad_norm": 0.9338458776473999, "learning_rate": 7.1304338332681534e-06, "loss": 0.7152, "step": 1879 }, { "epoch": 0.37926284566914736, "grad_norm": 0.48263630270957947, "learning_rate": 7.127477123796559e-06, "loss": 0.8125, "step": 1880 }, { "epoch": 0.37946458122535437, "grad_norm": 0.443023145198822, "learning_rate": 7.124519505633633e-06, "loss": 0.6722, "step": 1881 }, { "epoch": 0.3796663167815614, "grad_norm": 0.45879748463630676, "learning_rate": 7.121560980042641e-06, "loss": 0.8428, "step": 1882 }, { "epoch": 0.3798680523377684, "grad_norm": 0.5562877655029297, "learning_rate": 7.11860154828723e-06, "loss": 0.8586, "step": 1883 }, { "epoch": 0.38006978789397533, "grad_norm": 0.6089495420455933, "learning_rate": 7.1156412116314374e-06, "loss": 0.7096, "step": 1884 }, { "epoch": 0.38027152345018234, "grad_norm": 0.45152896642684937, "learning_rate": 7.112679971339689e-06, "loss": 0.6575, "step": 1885 }, { "epoch": 0.38047325900638934, "grad_norm": 0.5453755259513855, "learning_rate": 7.109717828676792e-06, "loss": 0.6727, "step": 1886 }, { "epoch": 0.38067499456259635, "grad_norm": 0.33480140566825867, "learning_rate": 7.106754784907942e-06, "loss": 0.6577, "step": 1887 }, { "epoch": 0.38087673011880335, "grad_norm": 0.44161325693130493, "learning_rate": 7.10379084129872e-06, "loss": 0.704, "step": 1888 }, { "epoch": 0.3810784656750103, "grad_norm": 0.4320142865180969, "learning_rate": 7.100825999115089e-06, "loss": 0.6513, "step": 1889 }, { "epoch": 0.3812802012312173, "grad_norm": 0.38756075501441956, "learning_rate": 7.097860259623397e-06, "loss": 0.7126, "step": 1890 }, { "epoch": 0.3814819367874243, "grad_norm": 0.45298585295677185, "learning_rate": 7.094893624090375e-06, "loss": 0.6851, "step": 1891 }, { "epoch": 0.3816836723436313, "grad_norm": 0.3509814143180847, "learning_rate": 7.091926093783139e-06, "loss": 0.6829, "step": 1892 }, { "epoch": 0.38188540789983827, "grad_norm": 0.5120080709457397, "learning_rate": 7.088957669969182e-06, "loss": 0.7134, "step": 1893 }, { "epoch": 0.3820871434560453, "grad_norm": 0.389632910490036, "learning_rate": 7.085988353916385e-06, "loss": 0.7099, "step": 1894 }, { "epoch": 0.3822888790122523, "grad_norm": 0.39382511377334595, "learning_rate": 7.083018146893003e-06, "loss": 0.6807, "step": 1895 }, { "epoch": 0.3824906145684593, "grad_norm": 0.5153424739837646, "learning_rate": 7.08004705016768e-06, "loss": 0.6804, "step": 1896 }, { "epoch": 0.3826923501246663, "grad_norm": 0.37517619132995605, "learning_rate": 7.0770750650094335e-06, "loss": 0.7282, "step": 1897 }, { "epoch": 0.38289408568087324, "grad_norm": 0.705549418926239, "learning_rate": 7.07410219268766e-06, "loss": 0.6881, "step": 1898 }, { "epoch": 0.38309582123708025, "grad_norm": 0.8489635586738586, "learning_rate": 7.071128434472141e-06, "loss": 0.7333, "step": 1899 }, { "epoch": 0.38329755679328725, "grad_norm": 0.7200409173965454, "learning_rate": 7.06815379163303e-06, "loss": 0.6978, "step": 1900 }, { "epoch": 0.38349929234949426, "grad_norm": 0.6117933392524719, "learning_rate": 7.065178265440864e-06, "loss": 0.6218, "step": 1901 }, { "epoch": 0.3837010279057012, "grad_norm": 0.41154125332832336, "learning_rate": 7.0622018571665514e-06, "loss": 0.7046, "step": 1902 }, { "epoch": 0.3839027634619082, "grad_norm": 0.5020780563354492, "learning_rate": 7.059224568081381e-06, "loss": 0.8056, "step": 1903 }, { "epoch": 0.3841044990181152, "grad_norm": 0.7355788350105286, "learning_rate": 7.056246399457019e-06, "loss": 0.7505, "step": 1904 }, { "epoch": 0.3843062345743222, "grad_norm": 0.8149549961090088, "learning_rate": 7.053267352565504e-06, "loss": 0.659, "step": 1905 }, { "epoch": 0.38450797013052923, "grad_norm": 0.37660858035087585, "learning_rate": 7.05028742867925e-06, "loss": 0.723, "step": 1906 }, { "epoch": 0.3847097056867362, "grad_norm": 0.5093876123428345, "learning_rate": 7.047306629071048e-06, "loss": 0.6905, "step": 1907 }, { "epoch": 0.3849114412429432, "grad_norm": 0.47235724329948425, "learning_rate": 7.044324955014062e-06, "loss": 0.6641, "step": 1908 }, { "epoch": 0.3851131767991502, "grad_norm": 0.5246691107749939, "learning_rate": 7.04134240778183e-06, "loss": 0.6654, "step": 1909 }, { "epoch": 0.3853149123553572, "grad_norm": 0.41439288854599, "learning_rate": 7.03835898864826e-06, "loss": 0.6438, "step": 1910 }, { "epoch": 0.3855166479115642, "grad_norm": 0.47759121656417847, "learning_rate": 7.0353746988876345e-06, "loss": 0.698, "step": 1911 }, { "epoch": 0.38571838346777115, "grad_norm": 0.5373578071594238, "learning_rate": 7.032389539774611e-06, "loss": 0.7928, "step": 1912 }, { "epoch": 0.38592011902397816, "grad_norm": 0.38632825016975403, "learning_rate": 7.029403512584214e-06, "loss": 0.6464, "step": 1913 }, { "epoch": 0.38612185458018516, "grad_norm": 0.743184506893158, "learning_rate": 7.026416618591838e-06, "loss": 0.697, "step": 1914 }, { "epoch": 0.38632359013639217, "grad_norm": 0.36446547508239746, "learning_rate": 7.0234288590732516e-06, "loss": 0.648, "step": 1915 }, { "epoch": 0.3865253256925991, "grad_norm": 0.32801979780197144, "learning_rate": 7.020440235304593e-06, "loss": 0.6475, "step": 1916 }, { "epoch": 0.3867270612488061, "grad_norm": 0.48668283224105835, "learning_rate": 7.017450748562364e-06, "loss": 0.9108, "step": 1917 }, { "epoch": 0.38692879680501313, "grad_norm": 0.5561202764511108, "learning_rate": 7.0144604001234405e-06, "loss": 0.6994, "step": 1918 }, { "epoch": 0.38713053236122014, "grad_norm": 0.3585546016693115, "learning_rate": 7.011469191265066e-06, "loss": 0.7193, "step": 1919 }, { "epoch": 0.38733226791742714, "grad_norm": 0.627954363822937, "learning_rate": 7.008477123264849e-06, "loss": 0.6728, "step": 1920 }, { "epoch": 0.3875340034736341, "grad_norm": 0.47432732582092285, "learning_rate": 7.005484197400765e-06, "loss": 0.6372, "step": 1921 }, { "epoch": 0.3877357390298411, "grad_norm": 0.6232267618179321, "learning_rate": 7.00249041495116e-06, "loss": 0.8147, "step": 1922 }, { "epoch": 0.3879374745860481, "grad_norm": 0.5535650849342346, "learning_rate": 6.99949577719474e-06, "loss": 0.6625, "step": 1923 }, { "epoch": 0.3881392101422551, "grad_norm": 0.5244125127792358, "learning_rate": 6.996500285410582e-06, "loss": 0.7226, "step": 1924 }, { "epoch": 0.38834094569846206, "grad_norm": 0.8325016498565674, "learning_rate": 6.993503940878126e-06, "loss": 0.7708, "step": 1925 }, { "epoch": 0.38854268125466906, "grad_norm": 0.4260006546974182, "learning_rate": 6.990506744877171e-06, "loss": 0.7013, "step": 1926 }, { "epoch": 0.38874441681087607, "grad_norm": 0.3316943049430847, "learning_rate": 6.987508698687886e-06, "loss": 0.6514, "step": 1927 }, { "epoch": 0.3889461523670831, "grad_norm": 0.31761300563812256, "learning_rate": 6.984509803590802e-06, "loss": 0.6922, "step": 1928 }, { "epoch": 0.3891478879232901, "grad_norm": 0.5712217092514038, "learning_rate": 6.981510060866812e-06, "loss": 0.7326, "step": 1929 }, { "epoch": 0.38934962347949703, "grad_norm": 0.4465492367744446, "learning_rate": 6.97850947179717e-06, "loss": 0.9044, "step": 1930 }, { "epoch": 0.38955135903570404, "grad_norm": 0.350298136472702, "learning_rate": 6.97550803766349e-06, "loss": 0.6799, "step": 1931 }, { "epoch": 0.38975309459191104, "grad_norm": 0.4485641121864319, "learning_rate": 6.972505759747754e-06, "loss": 0.8262, "step": 1932 }, { "epoch": 0.38995483014811805, "grad_norm": 0.5809760093688965, "learning_rate": 6.969502639332298e-06, "loss": 0.6894, "step": 1933 }, { "epoch": 0.390156565704325, "grad_norm": 0.3777012825012207, "learning_rate": 6.9664986776998155e-06, "loss": 0.6713, "step": 1934 }, { "epoch": 0.390358301260532, "grad_norm": 0.4996185898780823, "learning_rate": 6.963493876133367e-06, "loss": 0.6522, "step": 1935 }, { "epoch": 0.390560036816739, "grad_norm": 0.7318883538246155, "learning_rate": 6.960488235916367e-06, "loss": 0.7009, "step": 1936 }, { "epoch": 0.390761772372946, "grad_norm": 0.46615687012672424, "learning_rate": 6.957481758332592e-06, "loss": 0.8314, "step": 1937 }, { "epoch": 0.390963507929153, "grad_norm": 0.3986649811267853, "learning_rate": 6.954474444666169e-06, "loss": 0.6622, "step": 1938 }, { "epoch": 0.39116524348535997, "grad_norm": 0.3693607747554779, "learning_rate": 6.951466296201587e-06, "loss": 0.6999, "step": 1939 }, { "epoch": 0.391366979041567, "grad_norm": 0.5240568518638611, "learning_rate": 6.948457314223693e-06, "loss": 0.8285, "step": 1940 }, { "epoch": 0.391568714597774, "grad_norm": 0.5109261870384216, "learning_rate": 6.945447500017689e-06, "loss": 0.6924, "step": 1941 }, { "epoch": 0.391770450153981, "grad_norm": 0.5352274179458618, "learning_rate": 6.942436854869129e-06, "loss": 0.6813, "step": 1942 }, { "epoch": 0.39197218571018794, "grad_norm": 0.6227556467056274, "learning_rate": 6.939425380063924e-06, "loss": 0.6969, "step": 1943 }, { "epoch": 0.39217392126639494, "grad_norm": 0.7124168276786804, "learning_rate": 6.936413076888344e-06, "loss": 0.6694, "step": 1944 }, { "epoch": 0.39237565682260195, "grad_norm": 0.44350770115852356, "learning_rate": 6.933399946629005e-06, "loss": 0.653, "step": 1945 }, { "epoch": 0.39257739237880895, "grad_norm": 0.48074135184288025, "learning_rate": 6.930385990572879e-06, "loss": 0.6888, "step": 1946 }, { "epoch": 0.39277912793501596, "grad_norm": 0.400717169046402, "learning_rate": 6.927371210007293e-06, "loss": 0.7551, "step": 1947 }, { "epoch": 0.3929808634912229, "grad_norm": 0.5485891699790955, "learning_rate": 6.924355606219927e-06, "loss": 0.7881, "step": 1948 }, { "epoch": 0.3931825990474299, "grad_norm": 0.37272897362709045, "learning_rate": 6.921339180498807e-06, "loss": 0.6898, "step": 1949 }, { "epoch": 0.3933843346036369, "grad_norm": 0.5032504796981812, "learning_rate": 6.918321934132315e-06, "loss": 0.7051, "step": 1950 }, { "epoch": 0.3935860701598439, "grad_norm": 0.3550852835178375, "learning_rate": 6.915303868409182e-06, "loss": 0.6538, "step": 1951 }, { "epoch": 0.3937878057160509, "grad_norm": 0.7708501219749451, "learning_rate": 6.9122849846184895e-06, "loss": 0.649, "step": 1952 }, { "epoch": 0.3939895412722579, "grad_norm": 0.878424882888794, "learning_rate": 6.909265284049664e-06, "loss": 0.6987, "step": 1953 }, { "epoch": 0.3941912768284649, "grad_norm": 0.4390408992767334, "learning_rate": 6.90624476799249e-06, "loss": 0.7713, "step": 1954 }, { "epoch": 0.3943930123846719, "grad_norm": 0.7288705110549927, "learning_rate": 6.903223437737092e-06, "loss": 0.6889, "step": 1955 }, { "epoch": 0.3945947479408789, "grad_norm": 1.3917056322097778, "learning_rate": 6.900201294573946e-06, "loss": 0.9714, "step": 1956 }, { "epoch": 0.39479648349708585, "grad_norm": 0.41537991166114807, "learning_rate": 6.897178339793875e-06, "loss": 0.6585, "step": 1957 }, { "epoch": 0.39499821905329285, "grad_norm": 1.4110198020935059, "learning_rate": 6.894154574688046e-06, "loss": 0.7028, "step": 1958 }, { "epoch": 0.39519995460949986, "grad_norm": 0.4615449607372284, "learning_rate": 6.891130000547979e-06, "loss": 0.6434, "step": 1959 }, { "epoch": 0.39540169016570686, "grad_norm": 0.5821374654769897, "learning_rate": 6.888104618665529e-06, "loss": 0.6351, "step": 1960 }, { "epoch": 0.3956034257219138, "grad_norm": 0.7208396792411804, "learning_rate": 6.885078430332905e-06, "loss": 0.7269, "step": 1961 }, { "epoch": 0.3958051612781208, "grad_norm": 0.544691801071167, "learning_rate": 6.8820514368426565e-06, "loss": 0.6914, "step": 1962 }, { "epoch": 0.3960068968343278, "grad_norm": 0.4440980553627014, "learning_rate": 6.879023639487676e-06, "loss": 0.7014, "step": 1963 }, { "epoch": 0.39620863239053483, "grad_norm": 0.6505277156829834, "learning_rate": 6.875995039561206e-06, "loss": 0.673, "step": 1964 }, { "epoch": 0.39641036794674184, "grad_norm": 0.7644113898277283, "learning_rate": 6.872965638356823e-06, "loss": 0.7049, "step": 1965 }, { "epoch": 0.3966121035029488, "grad_norm": 0.6489436626434326, "learning_rate": 6.869935437168449e-06, "loss": 0.6419, "step": 1966 }, { "epoch": 0.3968138390591558, "grad_norm": 0.7834652662277222, "learning_rate": 6.8669044372903495e-06, "loss": 0.6944, "step": 1967 }, { "epoch": 0.3970155746153628, "grad_norm": 0.35680845379829407, "learning_rate": 6.86387264001713e-06, "loss": 0.6467, "step": 1968 }, { "epoch": 0.3972173101715698, "grad_norm": 0.9005623459815979, "learning_rate": 6.860840046643736e-06, "loss": 0.7223, "step": 1969 }, { "epoch": 0.39741904572777675, "grad_norm": 0.5492415428161621, "learning_rate": 6.857806658465453e-06, "loss": 0.6834, "step": 1970 }, { "epoch": 0.39762078128398376, "grad_norm": 0.3760004937648773, "learning_rate": 6.854772476777909e-06, "loss": 0.6643, "step": 1971 }, { "epoch": 0.39782251684019077, "grad_norm": 0.36829209327697754, "learning_rate": 6.851737502877066e-06, "loss": 0.6687, "step": 1972 }, { "epoch": 0.39802425239639777, "grad_norm": 0.4438265264034271, "learning_rate": 6.8487017380592266e-06, "loss": 0.7908, "step": 1973 }, { "epoch": 0.3982259879526048, "grad_norm": 0.5089324712753296, "learning_rate": 6.845665183621033e-06, "loss": 0.7331, "step": 1974 }, { "epoch": 0.3984277235088117, "grad_norm": 0.7725141644477844, "learning_rate": 6.842627840859461e-06, "loss": 1.1026, "step": 1975 }, { "epoch": 0.39862945906501873, "grad_norm": 0.39934641122817993, "learning_rate": 6.839589711071828e-06, "loss": 0.7376, "step": 1976 }, { "epoch": 0.39883119462122574, "grad_norm": 0.6319762468338013, "learning_rate": 6.836550795555781e-06, "loss": 1.0085, "step": 1977 }, { "epoch": 0.39903293017743274, "grad_norm": 0.33538317680358887, "learning_rate": 6.833511095609309e-06, "loss": 0.6938, "step": 1978 }, { "epoch": 0.3992346657336397, "grad_norm": 0.6401187777519226, "learning_rate": 6.830470612530733e-06, "loss": 0.7073, "step": 1979 }, { "epoch": 0.3994364012898467, "grad_norm": 0.5242562890052795, "learning_rate": 6.827429347618709e-06, "loss": 0.7513, "step": 1980 }, { "epoch": 0.3996381368460537, "grad_norm": 1.2201377153396606, "learning_rate": 6.824387302172225e-06, "loss": 0.6687, "step": 1981 }, { "epoch": 0.3998398724022607, "grad_norm": 0.39438146352767944, "learning_rate": 6.821344477490605e-06, "loss": 0.6536, "step": 1982 }, { "epoch": 0.4000416079584677, "grad_norm": 0.4241214692592621, "learning_rate": 6.818300874873508e-06, "loss": 0.8196, "step": 1983 }, { "epoch": 0.40024334351467467, "grad_norm": 1.178167462348938, "learning_rate": 6.815256495620919e-06, "loss": 0.6902, "step": 1984 }, { "epoch": 0.40044507907088167, "grad_norm": 0.3699394464492798, "learning_rate": 6.812211341033158e-06, "loss": 0.6524, "step": 1985 }, { "epoch": 0.4006468146270887, "grad_norm": 1.5307899713516235, "learning_rate": 6.8091654124108765e-06, "loss": 0.7088, "step": 1986 }, { "epoch": 0.4008485501832957, "grad_norm": 0.925547182559967, "learning_rate": 6.8061187110550586e-06, "loss": 0.701, "step": 1987 }, { "epoch": 0.40105028573950263, "grad_norm": 0.5421550869941711, "learning_rate": 6.803071238267011e-06, "loss": 0.6445, "step": 1988 }, { "epoch": 0.40125202129570964, "grad_norm": 0.5064270496368408, "learning_rate": 6.800022995348381e-06, "loss": 0.7091, "step": 1989 }, { "epoch": 0.40145375685191664, "grad_norm": 0.4140400290489197, "learning_rate": 6.796973983601135e-06, "loss": 0.6602, "step": 1990 }, { "epoch": 0.40165549240812365, "grad_norm": 0.49159950017929077, "learning_rate": 6.793924204327572e-06, "loss": 0.6981, "step": 1991 }, { "epoch": 0.40185722796433065, "grad_norm": 0.3385809361934662, "learning_rate": 6.790873658830321e-06, "loss": 0.6857, "step": 1992 }, { "epoch": 0.4020589635205376, "grad_norm": 2.379497528076172, "learning_rate": 6.787822348412333e-06, "loss": 0.7229, "step": 1993 }, { "epoch": 0.4022606990767446, "grad_norm": 0.310107946395874, "learning_rate": 6.784770274376888e-06, "loss": 0.7501, "step": 1994 }, { "epoch": 0.4024624346329516, "grad_norm": 0.503961980342865, "learning_rate": 6.781717438027594e-06, "loss": 0.6566, "step": 1995 }, { "epoch": 0.4026641701891586, "grad_norm": 0.3085574805736542, "learning_rate": 6.7786638406683845e-06, "loss": 0.6601, "step": 1996 }, { "epoch": 0.40286590574536557, "grad_norm": 0.6734681129455566, "learning_rate": 6.775609483603516e-06, "loss": 0.7521, "step": 1997 }, { "epoch": 0.4030676413015726, "grad_norm": 0.4044412076473236, "learning_rate": 6.772554368137567e-06, "loss": 0.8514, "step": 1998 }, { "epoch": 0.4032693768577796, "grad_norm": 0.7736945152282715, "learning_rate": 6.7694984955754465e-06, "loss": 0.665, "step": 1999 }, { "epoch": 0.4034711124139866, "grad_norm": 0.5825895667076111, "learning_rate": 6.766441867222384e-06, "loss": 0.705, "step": 2000 }, { "epoch": 0.4036728479701936, "grad_norm": 5.248704433441162, "learning_rate": 6.763384484383929e-06, "loss": 0.6642, "step": 2001 }, { "epoch": 0.40387458352640054, "grad_norm": 0.5176589488983154, "learning_rate": 6.760326348365955e-06, "loss": 0.6548, "step": 2002 }, { "epoch": 0.40407631908260755, "grad_norm": 0.41134941577911377, "learning_rate": 6.757267460474663e-06, "loss": 0.6557, "step": 2003 }, { "epoch": 0.40427805463881455, "grad_norm": 3.7623648643493652, "learning_rate": 6.754207822016565e-06, "loss": 0.7574, "step": 2004 }, { "epoch": 0.40447979019502156, "grad_norm": 1.6524170637130737, "learning_rate": 6.7511474342985e-06, "loss": 0.6833, "step": 2005 }, { "epoch": 0.40468152575122857, "grad_norm": 1.2042206525802612, "learning_rate": 6.748086298627624e-06, "loss": 0.6984, "step": 2006 }, { "epoch": 0.4048832613074355, "grad_norm": 0.7427788376808167, "learning_rate": 6.745024416311418e-06, "loss": 0.6826, "step": 2007 }, { "epoch": 0.4050849968636425, "grad_norm": 0.5733217597007751, "learning_rate": 6.7419617886576735e-06, "loss": 0.6935, "step": 2008 }, { "epoch": 0.4052867324198495, "grad_norm": 0.3138233423233032, "learning_rate": 6.738898416974507e-06, "loss": 0.7866, "step": 2009 }, { "epoch": 0.40548846797605653, "grad_norm": 2.404067277908325, "learning_rate": 6.7358343025703506e-06, "loss": 0.6897, "step": 2010 }, { "epoch": 0.4056902035322635, "grad_norm": 0.4286998212337494, "learning_rate": 6.732769446753954e-06, "loss": 0.7369, "step": 2011 }, { "epoch": 0.4058919390884705, "grad_norm": 0.8496800661087036, "learning_rate": 6.729703850834381e-06, "loss": 0.7423, "step": 2012 }, { "epoch": 0.4060936746446775, "grad_norm": 0.5670628547668457, "learning_rate": 6.7266375161210175e-06, "loss": 0.7706, "step": 2013 }, { "epoch": 0.4062954102008845, "grad_norm": 0.5016106367111206, "learning_rate": 6.723570443923557e-06, "loss": 0.6655, "step": 2014 }, { "epoch": 0.4064971457570915, "grad_norm": 0.4219368100166321, "learning_rate": 6.7205026355520145e-06, "loss": 0.7931, "step": 2015 }, { "epoch": 0.40669888131329845, "grad_norm": 0.5831752419471741, "learning_rate": 6.717434092316716e-06, "loss": 0.753, "step": 2016 }, { "epoch": 0.40690061686950546, "grad_norm": 0.5141821503639221, "learning_rate": 6.7143648155283025e-06, "loss": 0.7004, "step": 2017 }, { "epoch": 0.40710235242571247, "grad_norm": 0.43418243527412415, "learning_rate": 6.71129480649773e-06, "loss": 0.684, "step": 2018 }, { "epoch": 0.40730408798191947, "grad_norm": 0.3431943356990814, "learning_rate": 6.708224066536263e-06, "loss": 0.7785, "step": 2019 }, { "epoch": 0.4075058235381264, "grad_norm": 0.40281733870506287, "learning_rate": 6.705152596955483e-06, "loss": 0.6922, "step": 2020 }, { "epoch": 0.4077075590943334, "grad_norm": 0.6138989925384521, "learning_rate": 6.70208039906728e-06, "loss": 0.6413, "step": 2021 }, { "epoch": 0.40790929465054043, "grad_norm": 0.8132925033569336, "learning_rate": 6.699007474183854e-06, "loss": 0.8472, "step": 2022 }, { "epoch": 0.40811103020674744, "grad_norm": 0.5439250469207764, "learning_rate": 6.695933823617719e-06, "loss": 0.7646, "step": 2023 }, { "epoch": 0.40831276576295444, "grad_norm": 0.48923900723457336, "learning_rate": 6.6928594486817e-06, "loss": 0.6508, "step": 2024 }, { "epoch": 0.4085145013191614, "grad_norm": 0.6131086945533752, "learning_rate": 6.689784350688926e-06, "loss": 0.6921, "step": 2025 }, { "epoch": 0.4087162368753684, "grad_norm": 0.44216862320899963, "learning_rate": 6.686708530952836e-06, "loss": 0.7758, "step": 2026 }, { "epoch": 0.4089179724315754, "grad_norm": 0.4120861291885376, "learning_rate": 6.6836319907871825e-06, "loss": 1.0358, "step": 2027 }, { "epoch": 0.4091197079877824, "grad_norm": 0.6558131575584412, "learning_rate": 6.68055473150602e-06, "loss": 0.7933, "step": 2028 }, { "epoch": 0.40932144354398936, "grad_norm": 1.0560377836227417, "learning_rate": 6.677476754423714e-06, "loss": 1.1087, "step": 2029 }, { "epoch": 0.40952317910019637, "grad_norm": 0.3615643382072449, "learning_rate": 6.674398060854931e-06, "loss": 0.7684, "step": 2030 }, { "epoch": 0.40972491465640337, "grad_norm": 0.636316180229187, "learning_rate": 6.671318652114652e-06, "loss": 0.6682, "step": 2031 }, { "epoch": 0.4099266502126104, "grad_norm": 1.1835805177688599, "learning_rate": 6.668238529518157e-06, "loss": 0.7633, "step": 2032 }, { "epoch": 0.4101283857688174, "grad_norm": 0.6155040860176086, "learning_rate": 6.66515769438103e-06, "loss": 0.6595, "step": 2033 }, { "epoch": 0.41033012132502433, "grad_norm": 0.5014193058013916, "learning_rate": 6.662076148019168e-06, "loss": 0.7761, "step": 2034 }, { "epoch": 0.41053185688123134, "grad_norm": 0.4848555624485016, "learning_rate": 6.65899389174876e-06, "loss": 0.6308, "step": 2035 }, { "epoch": 0.41073359243743834, "grad_norm": 0.44919511675834656, "learning_rate": 6.655910926886308e-06, "loss": 0.6908, "step": 2036 }, { "epoch": 0.41093532799364535, "grad_norm": 1.0742298364639282, "learning_rate": 6.65282725474861e-06, "loss": 0.7018, "step": 2037 }, { "epoch": 0.4111370635498523, "grad_norm": 0.46099919080734253, "learning_rate": 6.649742876652772e-06, "loss": 0.8293, "step": 2038 }, { "epoch": 0.4113387991060593, "grad_norm": 0.6284064650535583, "learning_rate": 6.646657793916196e-06, "loss": 0.679, "step": 2039 }, { "epoch": 0.4115405346622663, "grad_norm": 0.4924802780151367, "learning_rate": 6.643572007856587e-06, "loss": 0.8707, "step": 2040 }, { "epoch": 0.4117422702184733, "grad_norm": 0.4875141978263855, "learning_rate": 6.640485519791953e-06, "loss": 0.6645, "step": 2041 }, { "epoch": 0.4119440057746803, "grad_norm": 1.0110448598861694, "learning_rate": 6.637398331040597e-06, "loss": 0.7211, "step": 2042 }, { "epoch": 0.41214574133088727, "grad_norm": 0.8589099049568176, "learning_rate": 6.634310442921124e-06, "loss": 0.6869, "step": 2043 }, { "epoch": 0.4123474768870943, "grad_norm": 0.5140877962112427, "learning_rate": 6.63122185675244e-06, "loss": 0.624, "step": 2044 }, { "epoch": 0.4125492124433013, "grad_norm": 0.43134400248527527, "learning_rate": 6.628132573853745e-06, "loss": 0.6992, "step": 2045 }, { "epoch": 0.4127509479995083, "grad_norm": 0.6019126772880554, "learning_rate": 6.6250425955445386e-06, "loss": 0.7856, "step": 2046 }, { "epoch": 0.41295268355571524, "grad_norm": 1.1939761638641357, "learning_rate": 6.621951923144616e-06, "loss": 0.8442, "step": 2047 }, { "epoch": 0.41315441911192224, "grad_norm": 0.8242788314819336, "learning_rate": 6.618860557974073e-06, "loss": 0.6458, "step": 2048 }, { "epoch": 0.41335615466812925, "grad_norm": 0.6458016037940979, "learning_rate": 6.615768501353297e-06, "loss": 0.7739, "step": 2049 }, { "epoch": 0.41355789022433626, "grad_norm": 0.5104942917823792, "learning_rate": 6.612675754602968e-06, "loss": 0.7799, "step": 2050 }, { "epoch": 0.41375962578054326, "grad_norm": 0.6379034519195557, "learning_rate": 6.60958231904407e-06, "loss": 0.7387, "step": 2051 }, { "epoch": 0.4139613613367502, "grad_norm": 1.0765997171401978, "learning_rate": 6.606488195997876e-06, "loss": 0.681, "step": 2052 }, { "epoch": 0.4141630968929572, "grad_norm": 0.9007116556167603, "learning_rate": 6.603393386785948e-06, "loss": 0.7555, "step": 2053 }, { "epoch": 0.4143648324491642, "grad_norm": 0.48005685210227966, "learning_rate": 6.60029789273015e-06, "loss": 0.6721, "step": 2054 }, { "epoch": 0.4145665680053712, "grad_norm": 0.7423443794250488, "learning_rate": 6.5972017151526325e-06, "loss": 1.1185, "step": 2055 }, { "epoch": 0.4147683035615782, "grad_norm": 0.4530531167984009, "learning_rate": 6.59410485537584e-06, "loss": 1.0106, "step": 2056 }, { "epoch": 0.4149700391177852, "grad_norm": 0.42010125517845154, "learning_rate": 6.591007314722508e-06, "loss": 0.7337, "step": 2057 }, { "epoch": 0.4151717746739922, "grad_norm": 0.4540041387081146, "learning_rate": 6.587909094515663e-06, "loss": 0.6685, "step": 2058 }, { "epoch": 0.4153735102301992, "grad_norm": 0.6442490816116333, "learning_rate": 6.584810196078622e-06, "loss": 0.6358, "step": 2059 }, { "epoch": 0.4155752457864062, "grad_norm": 0.42252811789512634, "learning_rate": 6.58171062073499e-06, "loss": 0.6725, "step": 2060 }, { "epoch": 0.41577698134261315, "grad_norm": 0.7584266662597656, "learning_rate": 6.578610369808663e-06, "loss": 0.7934, "step": 2061 }, { "epoch": 0.41597871689882016, "grad_norm": 0.5483881831169128, "learning_rate": 6.575509444623825e-06, "loss": 0.7312, "step": 2062 }, { "epoch": 0.41618045245502716, "grad_norm": 0.4764006733894348, "learning_rate": 6.57240784650495e-06, "loss": 0.7083, "step": 2063 }, { "epoch": 0.41638218801123417, "grad_norm": 0.8024596571922302, "learning_rate": 6.569305576776794e-06, "loss": 0.832, "step": 2064 }, { "epoch": 0.4165839235674411, "grad_norm": 0.5202115178108215, "learning_rate": 6.566202636764406e-06, "loss": 0.6965, "step": 2065 }, { "epoch": 0.4167856591236481, "grad_norm": 0.8023049831390381, "learning_rate": 6.563099027793116e-06, "loss": 0.6656, "step": 2066 }, { "epoch": 0.41698739467985513, "grad_norm": 0.3753010928630829, "learning_rate": 6.559994751188545e-06, "loss": 0.688, "step": 2067 }, { "epoch": 0.41718913023606213, "grad_norm": 0.6880113482475281, "learning_rate": 6.5568898082765945e-06, "loss": 0.6656, "step": 2068 }, { "epoch": 0.41739086579226914, "grad_norm": 0.64495450258255, "learning_rate": 6.553784200383453e-06, "loss": 0.6485, "step": 2069 }, { "epoch": 0.4175926013484761, "grad_norm": 0.7110615968704224, "learning_rate": 6.550677928835592e-06, "loss": 0.7502, "step": 2070 }, { "epoch": 0.4177943369046831, "grad_norm": 0.4838223457336426, "learning_rate": 6.54757099495977e-06, "loss": 0.6866, "step": 2071 }, { "epoch": 0.4179960724608901, "grad_norm": 0.9055322408676147, "learning_rate": 6.544463400083021e-06, "loss": 0.6705, "step": 2072 }, { "epoch": 0.4181978080170971, "grad_norm": 0.38350167870521545, "learning_rate": 6.541355145532669e-06, "loss": 0.6856, "step": 2073 }, { "epoch": 0.41839954357330406, "grad_norm": 0.6029995083808899, "learning_rate": 6.538246232636316e-06, "loss": 0.6904, "step": 2074 }, { "epoch": 0.41860127912951106, "grad_norm": 0.36843833327293396, "learning_rate": 6.535136662721844e-06, "loss": 0.7428, "step": 2075 }, { "epoch": 0.41880301468571807, "grad_norm": 0.7784242630004883, "learning_rate": 6.5320264371174195e-06, "loss": 0.7155, "step": 2076 }, { "epoch": 0.4190047502419251, "grad_norm": 0.38750869035720825, "learning_rate": 6.528915557151484e-06, "loss": 0.708, "step": 2077 }, { "epoch": 0.4192064857981321, "grad_norm": 0.9216073155403137, "learning_rate": 6.525804024152765e-06, "loss": 0.8106, "step": 2078 }, { "epoch": 0.41940822135433903, "grad_norm": 0.7102227807044983, "learning_rate": 6.522691839450262e-06, "loss": 0.7246, "step": 2079 }, { "epoch": 0.41960995691054603, "grad_norm": 0.6181046962738037, "learning_rate": 6.51957900437326e-06, "loss": 0.6921, "step": 2080 }, { "epoch": 0.41981169246675304, "grad_norm": 0.432643324136734, "learning_rate": 6.5164655202513135e-06, "loss": 0.6676, "step": 2081 }, { "epoch": 0.42001342802296004, "grad_norm": 0.37533438205718994, "learning_rate": 6.5133513884142605e-06, "loss": 0.6286, "step": 2082 }, { "epoch": 0.420215163579167, "grad_norm": 0.7802932858467102, "learning_rate": 6.510236610192215e-06, "loss": 0.6887, "step": 2083 }, { "epoch": 0.420416899135374, "grad_norm": 0.3948320746421814, "learning_rate": 6.507121186915567e-06, "loss": 0.6774, "step": 2084 }, { "epoch": 0.420618634691581, "grad_norm": 0.3722754418849945, "learning_rate": 6.5040051199149755e-06, "loss": 0.8555, "step": 2085 }, { "epoch": 0.420820370247788, "grad_norm": 0.4183351695537567, "learning_rate": 6.500888410521385e-06, "loss": 0.8037, "step": 2086 }, { "epoch": 0.421022105803995, "grad_norm": 0.48188161849975586, "learning_rate": 6.497771060066008e-06, "loss": 0.7292, "step": 2087 }, { "epoch": 0.42122384136020197, "grad_norm": 1.078413724899292, "learning_rate": 6.494653069880332e-06, "loss": 0.7182, "step": 2088 }, { "epoch": 0.421425576916409, "grad_norm": 0.39380356669425964, "learning_rate": 6.491534441296117e-06, "loss": 0.6689, "step": 2089 }, { "epoch": 0.421627312472616, "grad_norm": 0.39865002036094666, "learning_rate": 6.488415175645395e-06, "loss": 0.6478, "step": 2090 }, { "epoch": 0.421829048028823, "grad_norm": 0.5632877945899963, "learning_rate": 6.485295274260476e-06, "loss": 0.8674, "step": 2091 }, { "epoch": 0.42203078358502993, "grad_norm": 0.2843717336654663, "learning_rate": 6.4821747384739344e-06, "loss": 0.653, "step": 2092 }, { "epoch": 0.42223251914123694, "grad_norm": 0.4226774573326111, "learning_rate": 6.479053569618616e-06, "loss": 0.7398, "step": 2093 }, { "epoch": 0.42243425469744394, "grad_norm": 0.3823719918727875, "learning_rate": 6.475931769027643e-06, "loss": 0.6444, "step": 2094 }, { "epoch": 0.42263599025365095, "grad_norm": 1.0371155738830566, "learning_rate": 6.472809338034405e-06, "loss": 0.6676, "step": 2095 }, { "epoch": 0.42283772580985796, "grad_norm": 0.5268231630325317, "learning_rate": 6.469686277972556e-06, "loss": 0.749, "step": 2096 }, { "epoch": 0.4230394613660649, "grad_norm": 0.35234931111335754, "learning_rate": 6.466562590176021e-06, "loss": 0.6727, "step": 2097 }, { "epoch": 0.4232411969222719, "grad_norm": 4.1712775230407715, "learning_rate": 6.463438275978998e-06, "loss": 0.7347, "step": 2098 }, { "epoch": 0.4234429324784789, "grad_norm": 0.4240151047706604, "learning_rate": 6.4603133367159486e-06, "loss": 0.8569, "step": 2099 }, { "epoch": 0.4236446680346859, "grad_norm": 0.40674498677253723, "learning_rate": 6.4571877737216e-06, "loss": 0.66, "step": 2100 }, { "epoch": 0.4238464035908929, "grad_norm": 0.37719449400901794, "learning_rate": 6.454061588330947e-06, "loss": 0.6394, "step": 2101 }, { "epoch": 0.4240481391470999, "grad_norm": 0.6937292218208313, "learning_rate": 6.450934781879254e-06, "loss": 0.6966, "step": 2102 }, { "epoch": 0.4242498747033069, "grad_norm": 0.6843068599700928, "learning_rate": 6.447807355702047e-06, "loss": 0.7033, "step": 2103 }, { "epoch": 0.4244516102595139, "grad_norm": 0.4702962338924408, "learning_rate": 6.444679311135112e-06, "loss": 0.8195, "step": 2104 }, { "epoch": 0.4246533458157209, "grad_norm": 1.1637814044952393, "learning_rate": 6.441550649514509e-06, "loss": 0.6992, "step": 2105 }, { "epoch": 0.42485508137192785, "grad_norm": 0.48818790912628174, "learning_rate": 6.4384213721765565e-06, "loss": 0.7211, "step": 2106 }, { "epoch": 0.42505681692813485, "grad_norm": 0.47552841901779175, "learning_rate": 6.4352914804578345e-06, "loss": 0.8105, "step": 2107 }, { "epoch": 0.42525855248434186, "grad_norm": 0.4511565566062927, "learning_rate": 6.43216097569519e-06, "loss": 0.7177, "step": 2108 }, { "epoch": 0.42546028804054886, "grad_norm": 0.38846555352211, "learning_rate": 6.429029859225725e-06, "loss": 0.6874, "step": 2109 }, { "epoch": 0.42566202359675587, "grad_norm": 0.5311095714569092, "learning_rate": 6.42589813238681e-06, "loss": 0.8165, "step": 2110 }, { "epoch": 0.4258637591529628, "grad_norm": 1.02147376537323, "learning_rate": 6.422765796516071e-06, "loss": 0.7223, "step": 2111 }, { "epoch": 0.4260654947091698, "grad_norm": 0.3974895477294922, "learning_rate": 6.419632852951398e-06, "loss": 0.8251, "step": 2112 }, { "epoch": 0.42626723026537683, "grad_norm": 0.6259453296661377, "learning_rate": 6.416499303030939e-06, "loss": 0.9337, "step": 2113 }, { "epoch": 0.42646896582158383, "grad_norm": 0.5818927884101868, "learning_rate": 6.413365148093097e-06, "loss": 0.6606, "step": 2114 }, { "epoch": 0.4266707013777908, "grad_norm": 2.3462257385253906, "learning_rate": 6.410230389476542e-06, "loss": 0.6489, "step": 2115 }, { "epoch": 0.4268724369339978, "grad_norm": 1.1948621273040771, "learning_rate": 6.407095028520194e-06, "loss": 0.8109, "step": 2116 }, { "epoch": 0.4270741724902048, "grad_norm": 1.4963581562042236, "learning_rate": 6.403959066563234e-06, "loss": 0.7941, "step": 2117 }, { "epoch": 0.4272759080464118, "grad_norm": 0.8830330967903137, "learning_rate": 6.4008225049450974e-06, "loss": 0.8084, "step": 2118 }, { "epoch": 0.4274776436026188, "grad_norm": 0.3570726811885834, "learning_rate": 6.397685345005482e-06, "loss": 0.6967, "step": 2119 }, { "epoch": 0.42767937915882576, "grad_norm": 0.5791197419166565, "learning_rate": 6.394547588084331e-06, "loss": 0.8182, "step": 2120 }, { "epoch": 0.42788111471503276, "grad_norm": 0.39418864250183105, "learning_rate": 6.3914092355218494e-06, "loss": 0.7964, "step": 2121 }, { "epoch": 0.42808285027123977, "grad_norm": 0.8419885039329529, "learning_rate": 6.388270288658498e-06, "loss": 0.7087, "step": 2122 }, { "epoch": 0.4282845858274468, "grad_norm": 0.9715898036956787, "learning_rate": 6.385130748834986e-06, "loss": 0.7973, "step": 2123 }, { "epoch": 0.4284863213836537, "grad_norm": 0.5298724174499512, "learning_rate": 6.38199061739228e-06, "loss": 0.6844, "step": 2124 }, { "epoch": 0.42868805693986073, "grad_norm": 0.4409395754337311, "learning_rate": 6.378849895671594e-06, "loss": 0.6847, "step": 2125 }, { "epoch": 0.42888979249606773, "grad_norm": 0.6608665585517883, "learning_rate": 6.375708585014403e-06, "loss": 0.6416, "step": 2126 }, { "epoch": 0.42909152805227474, "grad_norm": 0.35339251160621643, "learning_rate": 6.372566686762427e-06, "loss": 0.6711, "step": 2127 }, { "epoch": 0.42929326360848175, "grad_norm": 0.3852989375591278, "learning_rate": 6.369424202257637e-06, "loss": 0.6684, "step": 2128 }, { "epoch": 0.4294949991646887, "grad_norm": 0.428529292345047, "learning_rate": 6.366281132842256e-06, "loss": 0.8359, "step": 2129 }, { "epoch": 0.4296967347208957, "grad_norm": 0.45827290415763855, "learning_rate": 6.363137479858759e-06, "loss": 0.8202, "step": 2130 }, { "epoch": 0.4298984702771027, "grad_norm": 0.3035009503364563, "learning_rate": 6.359993244649865e-06, "loss": 0.6537, "step": 2131 }, { "epoch": 0.4301002058333097, "grad_norm": 0.39818131923675537, "learning_rate": 6.356848428558546e-06, "loss": 0.6618, "step": 2132 }, { "epoch": 0.43030194138951666, "grad_norm": 0.47690775990486145, "learning_rate": 6.35370303292802e-06, "loss": 0.6937, "step": 2133 }, { "epoch": 0.43050367694572367, "grad_norm": 0.4136514365673065, "learning_rate": 6.350557059101757e-06, "loss": 0.6429, "step": 2134 }, { "epoch": 0.4307054125019307, "grad_norm": 0.6138867139816284, "learning_rate": 6.347410508423464e-06, "loss": 0.6956, "step": 2135 }, { "epoch": 0.4309071480581377, "grad_norm": 0.8477742075920105, "learning_rate": 6.344263382237106e-06, "loss": 0.7875, "step": 2136 }, { "epoch": 0.4311088836143447, "grad_norm": 0.5474502444267273, "learning_rate": 6.341115681886885e-06, "loss": 0.6698, "step": 2137 }, { "epoch": 0.43131061917055163, "grad_norm": 0.6278495192527771, "learning_rate": 6.337967408717254e-06, "loss": 0.6913, "step": 2138 }, { "epoch": 0.43151235472675864, "grad_norm": 0.40545186400413513, "learning_rate": 6.334818564072906e-06, "loss": 0.629, "step": 2139 }, { "epoch": 0.43171409028296565, "grad_norm": 0.4732603430747986, "learning_rate": 6.331669149298781e-06, "loss": 0.6154, "step": 2140 }, { "epoch": 0.43191582583917265, "grad_norm": 0.7837668657302856, "learning_rate": 6.328519165740063e-06, "loss": 0.77, "step": 2141 }, { "epoch": 0.4321175613953796, "grad_norm": 0.35569074749946594, "learning_rate": 6.325368614742177e-06, "loss": 0.661, "step": 2142 }, { "epoch": 0.4323192969515866, "grad_norm": 0.5048931837081909, "learning_rate": 6.322217497650794e-06, "loss": 0.6205, "step": 2143 }, { "epoch": 0.4325210325077936, "grad_norm": 0.4467771053314209, "learning_rate": 6.3190658158118205e-06, "loss": 0.6533, "step": 2144 }, { "epoch": 0.4327227680640006, "grad_norm": 0.3703531324863434, "learning_rate": 6.315913570571408e-06, "loss": 0.6979, "step": 2145 }, { "epoch": 0.4329245036202076, "grad_norm": 0.8720201849937439, "learning_rate": 6.312760763275949e-06, "loss": 0.6939, "step": 2146 }, { "epoch": 0.4331262391764146, "grad_norm": 0.3208676278591156, "learning_rate": 6.3096073952720775e-06, "loss": 0.6957, "step": 2147 }, { "epoch": 0.4333279747326216, "grad_norm": 0.351581335067749, "learning_rate": 6.306453467906663e-06, "loss": 0.658, "step": 2148 }, { "epoch": 0.4335297102888286, "grad_norm": 0.44626230001449585, "learning_rate": 6.303298982526813e-06, "loss": 0.7386, "step": 2149 }, { "epoch": 0.4337314458450356, "grad_norm": 0.42948535084724426, "learning_rate": 6.300143940479881e-06, "loss": 0.6874, "step": 2150 }, { "epoch": 0.43393318140124254, "grad_norm": 0.4214824140071869, "learning_rate": 6.296988343113453e-06, "loss": 0.8205, "step": 2151 }, { "epoch": 0.43413491695744955, "grad_norm": 0.36822807788848877, "learning_rate": 6.29383219177535e-06, "loss": 0.6291, "step": 2152 }, { "epoch": 0.43433665251365655, "grad_norm": 0.9206970930099487, "learning_rate": 6.290675487813632e-06, "loss": 0.695, "step": 2153 }, { "epoch": 0.43453838806986356, "grad_norm": 0.3771558403968811, "learning_rate": 6.2875182325765995e-06, "loss": 0.8117, "step": 2154 }, { "epoch": 0.43474012362607056, "grad_norm": 0.8105612993240356, "learning_rate": 6.284360427412781e-06, "loss": 0.6993, "step": 2155 }, { "epoch": 0.4349418591822775, "grad_norm": 0.9626283645629883, "learning_rate": 6.281202073670942e-06, "loss": 0.6851, "step": 2156 }, { "epoch": 0.4351435947384845, "grad_norm": 0.3421020209789276, "learning_rate": 6.2780431727000865e-06, "loss": 0.6535, "step": 2157 }, { "epoch": 0.4353453302946915, "grad_norm": 0.40301311016082764, "learning_rate": 6.274883725849449e-06, "loss": 0.7201, "step": 2158 }, { "epoch": 0.43554706585089853, "grad_norm": 0.32986417412757874, "learning_rate": 6.271723734468496e-06, "loss": 0.7125, "step": 2159 }, { "epoch": 0.4357488014071055, "grad_norm": 0.3987983167171478, "learning_rate": 6.268563199906925e-06, "loss": 0.668, "step": 2160 }, { "epoch": 0.4359505369633125, "grad_norm": 1.0826027393341064, "learning_rate": 6.2654021235146745e-06, "loss": 0.8437, "step": 2161 }, { "epoch": 0.4361522725195195, "grad_norm": 0.6496895551681519, "learning_rate": 6.2622405066419046e-06, "loss": 0.694, "step": 2162 }, { "epoch": 0.4363540080757265, "grad_norm": 0.9457875490188599, "learning_rate": 6.25907835063901e-06, "loss": 0.6251, "step": 2163 }, { "epoch": 0.4365557436319335, "grad_norm": 0.4565484821796417, "learning_rate": 6.2559156568566185e-06, "loss": 0.7898, "step": 2164 }, { "epoch": 0.43675747918814045, "grad_norm": 1.442725658416748, "learning_rate": 6.252752426645581e-06, "loss": 0.7358, "step": 2165 }, { "epoch": 0.43695921474434746, "grad_norm": 1.2952994108200073, "learning_rate": 6.249588661356983e-06, "loss": 0.6899, "step": 2166 }, { "epoch": 0.43716095030055446, "grad_norm": 1.1040253639221191, "learning_rate": 6.246424362342139e-06, "loss": 0.7184, "step": 2167 }, { "epoch": 0.43736268585676147, "grad_norm": 1.4704509973526, "learning_rate": 6.243259530952585e-06, "loss": 0.6943, "step": 2168 }, { "epoch": 0.4375644214129684, "grad_norm": 1.258582592010498, "learning_rate": 6.240094168540091e-06, "loss": 0.8548, "step": 2169 }, { "epoch": 0.4377661569691754, "grad_norm": 0.42416098713874817, "learning_rate": 6.236928276456652e-06, "loss": 0.7267, "step": 2170 }, { "epoch": 0.43796789252538243, "grad_norm": 0.40356701612472534, "learning_rate": 6.233761856054488e-06, "loss": 0.7985, "step": 2171 }, { "epoch": 0.43816962808158944, "grad_norm": 0.360816091299057, "learning_rate": 6.230594908686045e-06, "loss": 0.6553, "step": 2172 }, { "epoch": 0.43837136363779644, "grad_norm": 0.8857923746109009, "learning_rate": 6.227427435703997e-06, "loss": 0.6654, "step": 2173 }, { "epoch": 0.4385730991940034, "grad_norm": 0.4565891921520233, "learning_rate": 6.224259438461235e-06, "loss": 0.7137, "step": 2174 }, { "epoch": 0.4387748347502104, "grad_norm": 0.4995157718658447, "learning_rate": 6.221090918310885e-06, "loss": 0.9537, "step": 2175 }, { "epoch": 0.4389765703064174, "grad_norm": 0.4343486428260803, "learning_rate": 6.217921876606285e-06, "loss": 0.6915, "step": 2176 }, { "epoch": 0.4391783058626244, "grad_norm": 0.37906357645988464, "learning_rate": 6.214752314701003e-06, "loss": 0.8003, "step": 2177 }, { "epoch": 0.43938004141883136, "grad_norm": 0.5478761792182922, "learning_rate": 6.2115822339488296e-06, "loss": 0.7823, "step": 2178 }, { "epoch": 0.43958177697503836, "grad_norm": 0.5704029202461243, "learning_rate": 6.208411635703771e-06, "loss": 0.6973, "step": 2179 }, { "epoch": 0.43978351253124537, "grad_norm": 0.5843176245689392, "learning_rate": 6.205240521320059e-06, "loss": 0.6887, "step": 2180 }, { "epoch": 0.4399852480874524, "grad_norm": 0.3240997791290283, "learning_rate": 6.2020688921521454e-06, "loss": 0.761, "step": 2181 }, { "epoch": 0.4401869836436594, "grad_norm": 0.3329768478870392, "learning_rate": 6.1988967495547016e-06, "loss": 0.627, "step": 2182 }, { "epoch": 0.44038871919986633, "grad_norm": 0.4571416974067688, "learning_rate": 6.195724094882618e-06, "loss": 0.8011, "step": 2183 }, { "epoch": 0.44059045475607334, "grad_norm": 0.5178765058517456, "learning_rate": 6.192550929491002e-06, "loss": 0.6937, "step": 2184 }, { "epoch": 0.44079219031228034, "grad_norm": 1.414166808128357, "learning_rate": 6.189377254735184e-06, "loss": 0.6913, "step": 2185 }, { "epoch": 0.44099392586848735, "grad_norm": 0.5404744744300842, "learning_rate": 6.186203071970708e-06, "loss": 0.8938, "step": 2186 }, { "epoch": 0.4411956614246943, "grad_norm": 0.4626302123069763, "learning_rate": 6.183028382553334e-06, "loss": 0.6291, "step": 2187 }, { "epoch": 0.4413973969809013, "grad_norm": 0.5483360290527344, "learning_rate": 6.179853187839041e-06, "loss": 0.678, "step": 2188 }, { "epoch": 0.4415991325371083, "grad_norm": 0.3046145737171173, "learning_rate": 6.176677489184024e-06, "loss": 0.6738, "step": 2189 }, { "epoch": 0.4418008680933153, "grad_norm": 0.6439849138259888, "learning_rate": 6.173501287944692e-06, "loss": 0.6438, "step": 2190 }, { "epoch": 0.4420026036495223, "grad_norm": 0.5328143239021301, "learning_rate": 6.170324585477669e-06, "loss": 0.6751, "step": 2191 }, { "epoch": 0.44220433920572927, "grad_norm": 1.3237669467926025, "learning_rate": 6.167147383139793e-06, "loss": 0.7781, "step": 2192 }, { "epoch": 0.4424060747619363, "grad_norm": 0.4817297160625458, "learning_rate": 6.163969682288115e-06, "loss": 0.6905, "step": 2193 }, { "epoch": 0.4426078103181433, "grad_norm": 0.6590097546577454, "learning_rate": 6.160791484279901e-06, "loss": 0.6987, "step": 2194 }, { "epoch": 0.4428095458743503, "grad_norm": 0.3818492293357849, "learning_rate": 6.157612790472626e-06, "loss": 0.6845, "step": 2195 }, { "epoch": 0.44301128143055724, "grad_norm": 0.5485851764678955, "learning_rate": 6.154433602223979e-06, "loss": 0.6867, "step": 2196 }, { "epoch": 0.44321301698676424, "grad_norm": 1.5924900770187378, "learning_rate": 6.1512539208918634e-06, "loss": 0.6557, "step": 2197 }, { "epoch": 0.44341475254297125, "grad_norm": 0.5956331491470337, "learning_rate": 6.1480737478343844e-06, "loss": 0.6194, "step": 2198 }, { "epoch": 0.44361648809917825, "grad_norm": 0.3365457355976105, "learning_rate": 6.144893084409865e-06, "loss": 0.6527, "step": 2199 }, { "epoch": 0.44381822365538526, "grad_norm": 0.47032344341278076, "learning_rate": 6.141711931976835e-06, "loss": 0.8081, "step": 2200 }, { "epoch": 0.4440199592115922, "grad_norm": 0.6154392957687378, "learning_rate": 6.138530291894033e-06, "loss": 0.6787, "step": 2201 }, { "epoch": 0.4442216947677992, "grad_norm": 0.39080673456192017, "learning_rate": 6.135348165520405e-06, "loss": 0.6672, "step": 2202 }, { "epoch": 0.4444234303240062, "grad_norm": 0.4939618408679962, "learning_rate": 6.132165554215108e-06, "loss": 0.7767, "step": 2203 }, { "epoch": 0.4446251658802132, "grad_norm": 0.6760823130607605, "learning_rate": 6.128982459337502e-06, "loss": 0.6899, "step": 2204 }, { "epoch": 0.44482690143642023, "grad_norm": 0.5723358392715454, "learning_rate": 6.1257988822471556e-06, "loss": 0.6011, "step": 2205 }, { "epoch": 0.4450286369926272, "grad_norm": 0.30772972106933594, "learning_rate": 6.122614824303845e-06, "loss": 0.779, "step": 2206 }, { "epoch": 0.4452303725488342, "grad_norm": 0.9321260452270508, "learning_rate": 6.119430286867548e-06, "loss": 0.6848, "step": 2207 }, { "epoch": 0.4454321081050412, "grad_norm": 0.7563874125480652, "learning_rate": 6.11624527129845e-06, "loss": 0.6603, "step": 2208 }, { "epoch": 0.4456338436612482, "grad_norm": 0.4485842287540436, "learning_rate": 6.1130597789569376e-06, "loss": 0.7239, "step": 2209 }, { "epoch": 0.44583557921745515, "grad_norm": 0.3827153742313385, "learning_rate": 6.109873811203609e-06, "loss": 0.8143, "step": 2210 }, { "epoch": 0.44603731477366215, "grad_norm": 0.41676798462867737, "learning_rate": 6.106687369399254e-06, "loss": 0.6945, "step": 2211 }, { "epoch": 0.44623905032986916, "grad_norm": 0.5404744148254395, "learning_rate": 6.103500454904871e-06, "loss": 0.6511, "step": 2212 }, { "epoch": 0.44644078588607616, "grad_norm": 0.6789629459381104, "learning_rate": 6.100313069081662e-06, "loss": 0.6963, "step": 2213 }, { "epoch": 0.44664252144228317, "grad_norm": 0.3863263428211212, "learning_rate": 6.097125213291029e-06, "loss": 0.6801, "step": 2214 }, { "epoch": 0.4468442569984901, "grad_norm": 1.1624631881713867, "learning_rate": 6.093936888894573e-06, "loss": 0.7014, "step": 2215 }, { "epoch": 0.4470459925546971, "grad_norm": 0.3568146526813507, "learning_rate": 6.0907480972540915e-06, "loss": 0.6531, "step": 2216 }, { "epoch": 0.44724772811090413, "grad_norm": 0.5413874387741089, "learning_rate": 6.087558839731594e-06, "loss": 0.6698, "step": 2217 }, { "epoch": 0.44744946366711114, "grad_norm": 0.5732948780059814, "learning_rate": 6.084369117689276e-06, "loss": 0.659, "step": 2218 }, { "epoch": 0.4476511992233181, "grad_norm": 0.3843459486961365, "learning_rate": 6.0811789324895365e-06, "loss": 0.6573, "step": 2219 }, { "epoch": 0.4478529347795251, "grad_norm": 0.4645818769931793, "learning_rate": 6.0779882854949745e-06, "loss": 0.6783, "step": 2220 }, { "epoch": 0.4480546703357321, "grad_norm": 0.4214332103729248, "learning_rate": 6.074797178068385e-06, "loss": 0.685, "step": 2221 }, { "epoch": 0.4482564058919391, "grad_norm": 0.4040057361125946, "learning_rate": 6.071605611572755e-06, "loss": 0.6768, "step": 2222 }, { "epoch": 0.4484581414481461, "grad_norm": 0.5158857107162476, "learning_rate": 6.068413587371274e-06, "loss": 0.7078, "step": 2223 }, { "epoch": 0.44865987700435306, "grad_norm": 0.37071144580841064, "learning_rate": 6.0652211068273226e-06, "loss": 0.6374, "step": 2224 }, { "epoch": 0.44886161256056006, "grad_norm": 0.4070124924182892, "learning_rate": 6.062028171304481e-06, "loss": 0.7792, "step": 2225 }, { "epoch": 0.44906334811676707, "grad_norm": 0.40504300594329834, "learning_rate": 6.058834782166516e-06, "loss": 0.6528, "step": 2226 }, { "epoch": 0.4492650836729741, "grad_norm": 0.553276538848877, "learning_rate": 6.055640940777398e-06, "loss": 0.6665, "step": 2227 }, { "epoch": 0.449466819229181, "grad_norm": 0.41161584854125977, "learning_rate": 6.052446648501283e-06, "loss": 0.8529, "step": 2228 }, { "epoch": 0.44966855478538803, "grad_norm": 0.4330577552318573, "learning_rate": 6.049251906702522e-06, "loss": 0.8221, "step": 2229 }, { "epoch": 0.44987029034159504, "grad_norm": 0.43640753626823425, "learning_rate": 6.046056716745659e-06, "loss": 0.6714, "step": 2230 }, { "epoch": 0.45007202589780204, "grad_norm": 0.5692388415336609, "learning_rate": 6.042861079995428e-06, "loss": 0.8885, "step": 2231 }, { "epoch": 0.45027376145400905, "grad_norm": 0.49734705686569214, "learning_rate": 6.039664997816753e-06, "loss": 0.6671, "step": 2232 }, { "epoch": 0.450475497010216, "grad_norm": 0.3377302289009094, "learning_rate": 6.036468471574751e-06, "loss": 0.7033, "step": 2233 }, { "epoch": 0.450677232566423, "grad_norm": 0.40693655610084534, "learning_rate": 6.033271502634729e-06, "loss": 0.666, "step": 2234 }, { "epoch": 0.45087896812263, "grad_norm": 0.4038968086242676, "learning_rate": 6.030074092362178e-06, "loss": 0.6822, "step": 2235 }, { "epoch": 0.451080703678837, "grad_norm": 0.7137503623962402, "learning_rate": 6.026876242122782e-06, "loss": 0.6872, "step": 2236 }, { "epoch": 0.45128243923504396, "grad_norm": 0.4246538579463959, "learning_rate": 6.023677953282412e-06, "loss": 0.8179, "step": 2237 }, { "epoch": 0.45148417479125097, "grad_norm": 0.5865968465805054, "learning_rate": 6.020479227207127e-06, "loss": 0.6563, "step": 2238 }, { "epoch": 0.451685910347458, "grad_norm": 0.40608182549476624, "learning_rate": 6.0172800652631706e-06, "loss": 0.7347, "step": 2239 }, { "epoch": 0.451887645903665, "grad_norm": 0.7402623891830444, "learning_rate": 6.014080468816972e-06, "loss": 0.7196, "step": 2240 }, { "epoch": 0.452089381459872, "grad_norm": 0.37510138750076294, "learning_rate": 6.010880439235153e-06, "loss": 1.0391, "step": 2241 }, { "epoch": 0.45229111701607894, "grad_norm": 0.45044469833374023, "learning_rate": 6.0076799778845105e-06, "loss": 0.6827, "step": 2242 }, { "epoch": 0.45249285257228594, "grad_norm": 0.4569462835788727, "learning_rate": 6.004479086132033e-06, "loss": 0.6673, "step": 2243 }, { "epoch": 0.45269458812849295, "grad_norm": 0.9873457551002502, "learning_rate": 6.001277765344888e-06, "loss": 0.641, "step": 2244 }, { "epoch": 0.45289632368469995, "grad_norm": 2.229550361633301, "learning_rate": 5.998076016890432e-06, "loss": 0.7482, "step": 2245 }, { "epoch": 0.4530980592409069, "grad_norm": 0.4194773733615875, "learning_rate": 5.994873842136198e-06, "loss": 0.6847, "step": 2246 }, { "epoch": 0.4532997947971139, "grad_norm": 0.3104577958583832, "learning_rate": 5.991671242449906e-06, "loss": 0.7933, "step": 2247 }, { "epoch": 0.4535015303533209, "grad_norm": 0.3971177637577057, "learning_rate": 5.988468219199451e-06, "loss": 0.8049, "step": 2248 }, { "epoch": 0.4537032659095279, "grad_norm": 0.34223371744155884, "learning_rate": 5.985264773752919e-06, "loss": 0.6613, "step": 2249 }, { "epoch": 0.4539050014657349, "grad_norm": 0.4462783932685852, "learning_rate": 5.982060907478568e-06, "loss": 0.8033, "step": 2250 }, { "epoch": 0.4541067370219419, "grad_norm": 0.9888842701911926, "learning_rate": 5.978856621744837e-06, "loss": 0.7698, "step": 2251 }, { "epoch": 0.4543084725781489, "grad_norm": 0.4110148549079895, "learning_rate": 5.975651917920347e-06, "loss": 0.8328, "step": 2252 }, { "epoch": 0.4545102081343559, "grad_norm": 1.3794505596160889, "learning_rate": 5.9724467973738965e-06, "loss": 0.6795, "step": 2253 }, { "epoch": 0.4547119436905629, "grad_norm": 0.5329458713531494, "learning_rate": 5.969241261474461e-06, "loss": 0.7191, "step": 2254 }, { "epoch": 0.45491367924676984, "grad_norm": 0.3493848145008087, "learning_rate": 5.966035311591194e-06, "loss": 0.8103, "step": 2255 }, { "epoch": 0.45511541480297685, "grad_norm": 0.6665728092193604, "learning_rate": 5.962828949093424e-06, "loss": 0.7297, "step": 2256 }, { "epoch": 0.45531715035918385, "grad_norm": 0.39233672618865967, "learning_rate": 5.959622175350661e-06, "loss": 0.6564, "step": 2257 }, { "epoch": 0.45551888591539086, "grad_norm": 0.4845694303512573, "learning_rate": 5.9564149917325845e-06, "loss": 0.7227, "step": 2258 }, { "epoch": 0.45572062147159786, "grad_norm": 0.5083968639373779, "learning_rate": 5.953207399609053e-06, "loss": 0.7313, "step": 2259 }, { "epoch": 0.4559223570278048, "grad_norm": 0.6814498901367188, "learning_rate": 5.9499994003500975e-06, "loss": 0.6717, "step": 2260 }, { "epoch": 0.4561240925840118, "grad_norm": 0.3610351085662842, "learning_rate": 5.946790995325924e-06, "loss": 0.7272, "step": 2261 }, { "epoch": 0.4563258281402188, "grad_norm": 0.3892957270145416, "learning_rate": 5.943582185906911e-06, "loss": 0.6967, "step": 2262 }, { "epoch": 0.45652756369642583, "grad_norm": 0.45099541544914246, "learning_rate": 5.940372973463612e-06, "loss": 0.9041, "step": 2263 }, { "epoch": 0.4567292992526328, "grad_norm": 0.41133007407188416, "learning_rate": 5.937163359366747e-06, "loss": 0.7562, "step": 2264 }, { "epoch": 0.4569310348088398, "grad_norm": 1.6504077911376953, "learning_rate": 5.933953344987215e-06, "loss": 0.7812, "step": 2265 }, { "epoch": 0.4571327703650468, "grad_norm": 0.4914233982563019, "learning_rate": 5.9307429316960805e-06, "loss": 0.6183, "step": 2266 }, { "epoch": 0.4573345059212538, "grad_norm": 0.5438753962516785, "learning_rate": 5.927532120864582e-06, "loss": 0.6406, "step": 2267 }, { "epoch": 0.4575362414774608, "grad_norm": 0.7342792749404907, "learning_rate": 5.924320913864124e-06, "loss": 0.6925, "step": 2268 }, { "epoch": 0.45773797703366775, "grad_norm": 0.4444006681442261, "learning_rate": 5.921109312066282e-06, "loss": 0.7297, "step": 2269 }, { "epoch": 0.45793971258987476, "grad_norm": 1.223875880241394, "learning_rate": 5.917897316842803e-06, "loss": 0.6645, "step": 2270 }, { "epoch": 0.45814144814608176, "grad_norm": 0.3372178077697754, "learning_rate": 5.914684929565596e-06, "loss": 0.7088, "step": 2271 }, { "epoch": 0.45834318370228877, "grad_norm": 0.5606435537338257, "learning_rate": 5.911472151606743e-06, "loss": 0.8502, "step": 2272 }, { "epoch": 0.4585449192584957, "grad_norm": 0.32245010137557983, "learning_rate": 5.908258984338491e-06, "loss": 0.7166, "step": 2273 }, { "epoch": 0.4587466548147027, "grad_norm": 0.8593288660049438, "learning_rate": 5.905045429133252e-06, "loss": 0.6902, "step": 2274 }, { "epoch": 0.45894839037090973, "grad_norm": 0.4007122218608856, "learning_rate": 5.901831487363605e-06, "loss": 0.6997, "step": 2275 }, { "epoch": 0.45915012592711674, "grad_norm": 0.6089879274368286, "learning_rate": 5.8986171604022925e-06, "loss": 0.7023, "step": 2276 }, { "epoch": 0.45935186148332374, "grad_norm": 0.4027574360370636, "learning_rate": 5.895402449622226e-06, "loss": 0.6861, "step": 2277 }, { "epoch": 0.4595535970395307, "grad_norm": 0.9821425676345825, "learning_rate": 5.8921873563964745e-06, "loss": 0.6767, "step": 2278 }, { "epoch": 0.4597553325957377, "grad_norm": 0.41719210147857666, "learning_rate": 5.8889718820982754e-06, "loss": 0.8088, "step": 2279 }, { "epoch": 0.4599570681519447, "grad_norm": 0.5477870106697083, "learning_rate": 5.885756028101025e-06, "loss": 0.6984, "step": 2280 }, { "epoch": 0.4601588037081517, "grad_norm": 0.37010979652404785, "learning_rate": 5.882539795778287e-06, "loss": 0.7369, "step": 2281 }, { "epoch": 0.46036053926435866, "grad_norm": 6.755899906158447, "learning_rate": 5.879323186503783e-06, "loss": 0.7295, "step": 2282 }, { "epoch": 0.46056227482056566, "grad_norm": 0.49411219358444214, "learning_rate": 5.876106201651392e-06, "loss": 0.6711, "step": 2283 }, { "epoch": 0.46076401037677267, "grad_norm": 0.47513481974601746, "learning_rate": 5.872888842595163e-06, "loss": 0.6993, "step": 2284 }, { "epoch": 0.4609657459329797, "grad_norm": 0.3968546986579895, "learning_rate": 5.869671110709296e-06, "loss": 0.8708, "step": 2285 }, { "epoch": 0.4611674814891867, "grad_norm": 0.6728366613388062, "learning_rate": 5.866453007368154e-06, "loss": 0.6404, "step": 2286 }, { "epoch": 0.46136921704539363, "grad_norm": 0.7965627312660217, "learning_rate": 5.86323453394626e-06, "loss": 0.649, "step": 2287 }, { "epoch": 0.46157095260160064, "grad_norm": 0.8242064118385315, "learning_rate": 5.860015691818292e-06, "loss": 0.6907, "step": 2288 }, { "epoch": 0.46177268815780764, "grad_norm": 0.3499292731285095, "learning_rate": 5.856796482359089e-06, "loss": 0.6556, "step": 2289 }, { "epoch": 0.46197442371401465, "grad_norm": 0.3247630000114441, "learning_rate": 5.853576906943641e-06, "loss": 0.8573, "step": 2290 }, { "epoch": 0.4621761592702216, "grad_norm": 0.5543577671051025, "learning_rate": 5.8503569669471e-06, "loss": 0.6823, "step": 2291 }, { "epoch": 0.4623778948264286, "grad_norm": 0.8258817791938782, "learning_rate": 5.847136663744772e-06, "loss": 0.8946, "step": 2292 }, { "epoch": 0.4625796303826356, "grad_norm": 0.7056267857551575, "learning_rate": 5.843915998712117e-06, "loss": 0.8463, "step": 2293 }, { "epoch": 0.4627813659388426, "grad_norm": 0.6977629065513611, "learning_rate": 5.840694973224752e-06, "loss": 0.6792, "step": 2294 }, { "epoch": 0.4629831014950496, "grad_norm": 0.30397582054138184, "learning_rate": 5.837473588658444e-06, "loss": 0.6731, "step": 2295 }, { "epoch": 0.46318483705125657, "grad_norm": 0.8290293216705322, "learning_rate": 5.8342518463891195e-06, "loss": 0.6895, "step": 2296 }, { "epoch": 0.4633865726074636, "grad_norm": 1.0042904615402222, "learning_rate": 5.831029747792851e-06, "loss": 0.6827, "step": 2297 }, { "epoch": 0.4635883081636706, "grad_norm": 0.8199013471603394, "learning_rate": 5.827807294245867e-06, "loss": 0.6703, "step": 2298 }, { "epoch": 0.4637900437198776, "grad_norm": 0.4754752516746521, "learning_rate": 5.824584487124546e-06, "loss": 0.8643, "step": 2299 }, { "epoch": 0.4639917792760846, "grad_norm": 0.33841854333877563, "learning_rate": 5.821361327805419e-06, "loss": 0.7325, "step": 2300 }, { "epoch": 0.46419351483229154, "grad_norm": 0.8162549734115601, "learning_rate": 5.8181378176651696e-06, "loss": 0.6413, "step": 2301 }, { "epoch": 0.46439525038849855, "grad_norm": 0.7209305167198181, "learning_rate": 5.814913958080625e-06, "loss": 0.6518, "step": 2302 }, { "epoch": 0.46459698594470555, "grad_norm": 0.6614891886711121, "learning_rate": 5.811689750428765e-06, "loss": 0.7149, "step": 2303 }, { "epoch": 0.46479872150091256, "grad_norm": 0.5822973251342773, "learning_rate": 5.808465196086719e-06, "loss": 0.7242, "step": 2304 }, { "epoch": 0.4650004570571195, "grad_norm": 0.904875636100769, "learning_rate": 5.805240296431765e-06, "loss": 0.6861, "step": 2305 }, { "epoch": 0.4652021926133265, "grad_norm": 0.4102171063423157, "learning_rate": 5.802015052841328e-06, "loss": 0.6944, "step": 2306 }, { "epoch": 0.4654039281695335, "grad_norm": 0.5328947305679321, "learning_rate": 5.798789466692974e-06, "loss": 0.6568, "step": 2307 }, { "epoch": 0.4656056637257405, "grad_norm": 0.7053859233856201, "learning_rate": 5.795563539364424e-06, "loss": 0.6781, "step": 2308 }, { "epoch": 0.46580739928194753, "grad_norm": 0.38549986481666565, "learning_rate": 5.7923372722335415e-06, "loss": 0.6573, "step": 2309 }, { "epoch": 0.4660091348381545, "grad_norm": 0.5039394497871399, "learning_rate": 5.7891106666783325e-06, "loss": 0.6786, "step": 2310 }, { "epoch": 0.4662108703943615, "grad_norm": 0.3344820439815521, "learning_rate": 5.78588372407695e-06, "loss": 0.9494, "step": 2311 }, { "epoch": 0.4664126059505685, "grad_norm": 0.451581746339798, "learning_rate": 5.782656445807695e-06, "loss": 0.6447, "step": 2312 }, { "epoch": 0.4666143415067755, "grad_norm": 1.1283626556396484, "learning_rate": 5.779428833249003e-06, "loss": 0.6672, "step": 2313 }, { "epoch": 0.46681607706298245, "grad_norm": 0.5923398733139038, "learning_rate": 5.776200887779458e-06, "loss": 0.6433, "step": 2314 }, { "epoch": 0.46701781261918945, "grad_norm": 0.8438357710838318, "learning_rate": 5.7729726107777855e-06, "loss": 0.6469, "step": 2315 }, { "epoch": 0.46721954817539646, "grad_norm": 0.8239353895187378, "learning_rate": 5.769744003622852e-06, "loss": 0.7574, "step": 2316 }, { "epoch": 0.46742128373160347, "grad_norm": 0.6511779427528381, "learning_rate": 5.766515067693665e-06, "loss": 0.8149, "step": 2317 }, { "epoch": 0.46762301928781047, "grad_norm": 0.5120037794113159, "learning_rate": 5.7632858043693726e-06, "loss": 0.7148, "step": 2318 }, { "epoch": 0.4678247548440174, "grad_norm": 0.5220741629600525, "learning_rate": 5.760056215029263e-06, "loss": 0.6585, "step": 2319 }, { "epoch": 0.4680264904002244, "grad_norm": 0.3358466327190399, "learning_rate": 5.756826301052764e-06, "loss": 0.6783, "step": 2320 }, { "epoch": 0.46822822595643143, "grad_norm": 0.7241223454475403, "learning_rate": 5.753596063819441e-06, "loss": 0.6732, "step": 2321 }, { "epoch": 0.46842996151263844, "grad_norm": 0.45295026898384094, "learning_rate": 5.750365504708998e-06, "loss": 0.6714, "step": 2322 }, { "epoch": 0.4686316970688454, "grad_norm": 0.7881321310997009, "learning_rate": 5.747134625101275e-06, "loss": 0.6594, "step": 2323 }, { "epoch": 0.4688334326250524, "grad_norm": 0.6307108402252197, "learning_rate": 5.7439034263762526e-06, "loss": 0.6983, "step": 2324 }, { "epoch": 0.4690351681812594, "grad_norm": 0.3908398151397705, "learning_rate": 5.740671909914044e-06, "loss": 0.7215, "step": 2325 }, { "epoch": 0.4692369037374664, "grad_norm": 0.33366483449935913, "learning_rate": 5.7374400770949e-06, "loss": 0.671, "step": 2326 }, { "epoch": 0.4694386392936734, "grad_norm": 0.4603020250797272, "learning_rate": 5.734207929299206e-06, "loss": 0.6932, "step": 2327 }, { "epoch": 0.46964037484988036, "grad_norm": 0.6322477459907532, "learning_rate": 5.730975467907481e-06, "loss": 0.8183, "step": 2328 }, { "epoch": 0.46984211040608737, "grad_norm": 0.3986794650554657, "learning_rate": 5.727742694300381e-06, "loss": 0.6771, "step": 2329 }, { "epoch": 0.47004384596229437, "grad_norm": 0.3935398459434509, "learning_rate": 5.724509609858693e-06, "loss": 0.6629, "step": 2330 }, { "epoch": 0.4702455815185014, "grad_norm": 0.38128989934921265, "learning_rate": 5.7212762159633335e-06, "loss": 0.7115, "step": 2331 }, { "epoch": 0.4704473170747083, "grad_norm": 0.7728134393692017, "learning_rate": 5.718042513995359e-06, "loss": 0.6499, "step": 2332 }, { "epoch": 0.47064905263091533, "grad_norm": 0.8129534721374512, "learning_rate": 5.714808505335952e-06, "loss": 0.8108, "step": 2333 }, { "epoch": 0.47085078818712234, "grad_norm": 0.47534510493278503, "learning_rate": 5.711574191366427e-06, "loss": 0.7435, "step": 2334 }, { "epoch": 0.47105252374332934, "grad_norm": 0.700268566608429, "learning_rate": 5.708339573468227e-06, "loss": 0.6906, "step": 2335 }, { "epoch": 0.47125425929953635, "grad_norm": 0.34931257367134094, "learning_rate": 5.705104653022931e-06, "loss": 0.667, "step": 2336 }, { "epoch": 0.4714559948557433, "grad_norm": 0.9690538644790649, "learning_rate": 5.701869431412243e-06, "loss": 0.6849, "step": 2337 }, { "epoch": 0.4716577304119503, "grad_norm": 0.36594158411026, "learning_rate": 5.698633910017993e-06, "loss": 0.6393, "step": 2338 }, { "epoch": 0.4718594659681573, "grad_norm": 0.34695130586624146, "learning_rate": 5.695398090222141e-06, "loss": 0.7614, "step": 2339 }, { "epoch": 0.4720612015243643, "grad_norm": 0.5252466797828674, "learning_rate": 5.69216197340678e-06, "loss": 0.8025, "step": 2340 }, { "epoch": 0.47226293708057127, "grad_norm": 0.34003615379333496, "learning_rate": 5.6889255609541236e-06, "loss": 0.669, "step": 2341 }, { "epoch": 0.47246467263677827, "grad_norm": 0.8574584722518921, "learning_rate": 5.68568885424651e-06, "loss": 0.6524, "step": 2342 }, { "epoch": 0.4726664081929853, "grad_norm": 0.39517006278038025, "learning_rate": 5.682451854666411e-06, "loss": 0.6406, "step": 2343 }, { "epoch": 0.4728681437491923, "grad_norm": 0.6471952199935913, "learning_rate": 5.6792145635964156e-06, "loss": 0.8623, "step": 2344 }, { "epoch": 0.4730698793053993, "grad_norm": 0.4071753919124603, "learning_rate": 5.675976982419243e-06, "loss": 0.8123, "step": 2345 }, { "epoch": 0.47327161486160624, "grad_norm": 0.5992933511734009, "learning_rate": 5.672739112517732e-06, "loss": 0.764, "step": 2346 }, { "epoch": 0.47347335041781324, "grad_norm": 0.3176419734954834, "learning_rate": 5.669500955274847e-06, "loss": 0.6406, "step": 2347 }, { "epoch": 0.47367508597402025, "grad_norm": 0.5436950922012329, "learning_rate": 5.666262512073676e-06, "loss": 0.6682, "step": 2348 }, { "epoch": 0.47387682153022725, "grad_norm": 0.5929723381996155, "learning_rate": 5.663023784297426e-06, "loss": 0.6594, "step": 2349 }, { "epoch": 0.4740785570864342, "grad_norm": 0.9469460248947144, "learning_rate": 5.65978477332943e-06, "loss": 0.6563, "step": 2350 }, { "epoch": 0.4742802926426412, "grad_norm": 0.9490289688110352, "learning_rate": 5.656545480553135e-06, "loss": 0.659, "step": 2351 }, { "epoch": 0.4744820281988482, "grad_norm": 0.40818971395492554, "learning_rate": 5.653305907352118e-06, "loss": 0.6625, "step": 2352 }, { "epoch": 0.4746837637550552, "grad_norm": 0.354064017534256, "learning_rate": 5.650066055110067e-06, "loss": 0.7012, "step": 2353 }, { "epoch": 0.4748854993112622, "grad_norm": 0.5287762880325317, "learning_rate": 5.646825925210795e-06, "loss": 0.8059, "step": 2354 }, { "epoch": 0.4750872348674692, "grad_norm": 0.4946046769618988, "learning_rate": 5.6435855190382284e-06, "loss": 0.6788, "step": 2355 }, { "epoch": 0.4752889704236762, "grad_norm": 0.3169674873352051, "learning_rate": 5.640344837976417e-06, "loss": 0.6524, "step": 2356 }, { "epoch": 0.4754907059798832, "grad_norm": 0.36257556080818176, "learning_rate": 5.637103883409525e-06, "loss": 0.6573, "step": 2357 }, { "epoch": 0.4756924415360902, "grad_norm": 1.1422092914581299, "learning_rate": 5.6338626567218335e-06, "loss": 0.7541, "step": 2358 }, { "epoch": 0.47589417709229714, "grad_norm": 0.3955131769180298, "learning_rate": 5.63062115929774e-06, "loss": 0.8808, "step": 2359 }, { "epoch": 0.47609591264850415, "grad_norm": 0.705595850944519, "learning_rate": 5.627379392521758e-06, "loss": 0.7939, "step": 2360 }, { "epoch": 0.47629764820471115, "grad_norm": 0.5015299320220947, "learning_rate": 5.624137357778519e-06, "loss": 0.7529, "step": 2361 }, { "epoch": 0.47649938376091816, "grad_norm": 1.0474416017532349, "learning_rate": 5.620895056452761e-06, "loss": 0.6746, "step": 2362 }, { "epoch": 0.47670111931712517, "grad_norm": 0.6244195103645325, "learning_rate": 5.617652489929342e-06, "loss": 0.7225, "step": 2363 }, { "epoch": 0.4769028548733321, "grad_norm": 0.41735851764678955, "learning_rate": 5.614409659593234e-06, "loss": 0.73, "step": 2364 }, { "epoch": 0.4771045904295391, "grad_norm": 0.42424479126930237, "learning_rate": 5.61116656682952e-06, "loss": 0.6849, "step": 2365 }, { "epoch": 0.4773063259857461, "grad_norm": 1.1443331241607666, "learning_rate": 5.607923213023392e-06, "loss": 0.8486, "step": 2366 }, { "epoch": 0.47750806154195313, "grad_norm": 0.38195037841796875, "learning_rate": 5.604679599560159e-06, "loss": 0.661, "step": 2367 }, { "epoch": 0.4777097970981601, "grad_norm": 0.5178070664405823, "learning_rate": 5.601435727825237e-06, "loss": 0.6907, "step": 2368 }, { "epoch": 0.4779115326543671, "grad_norm": 0.6520282030105591, "learning_rate": 5.598191599204153e-06, "loss": 0.7514, "step": 2369 }, { "epoch": 0.4781132682105741, "grad_norm": 0.9794221520423889, "learning_rate": 5.594947215082545e-06, "loss": 0.6626, "step": 2370 }, { "epoch": 0.4783150037667811, "grad_norm": 0.46517887711524963, "learning_rate": 5.59170257684616e-06, "loss": 0.8643, "step": 2371 }, { "epoch": 0.4785167393229881, "grad_norm": 0.6313295364379883, "learning_rate": 5.588457685880851e-06, "loss": 0.7549, "step": 2372 }, { "epoch": 0.47871847487919506, "grad_norm": 0.33170410990715027, "learning_rate": 5.585212543572585e-06, "loss": 0.898, "step": 2373 }, { "epoch": 0.47892021043540206, "grad_norm": 0.9356674551963806, "learning_rate": 5.5819671513074256e-06, "loss": 0.7104, "step": 2374 }, { "epoch": 0.47912194599160907, "grad_norm": 0.4888406991958618, "learning_rate": 5.578721510471554e-06, "loss": 0.8359, "step": 2375 }, { "epoch": 0.47932368154781607, "grad_norm": 0.5194557905197144, "learning_rate": 5.575475622451255e-06, "loss": 0.6816, "step": 2376 }, { "epoch": 0.479525417104023, "grad_norm": 0.4818446934223175, "learning_rate": 5.572229488632913e-06, "loss": 0.6913, "step": 2377 }, { "epoch": 0.47972715266023, "grad_norm": 0.9433100819587708, "learning_rate": 5.568983110403025e-06, "loss": 0.6474, "step": 2378 }, { "epoch": 0.47992888821643703, "grad_norm": 0.5533755421638489, "learning_rate": 5.565736489148188e-06, "loss": 0.8286, "step": 2379 }, { "epoch": 0.48013062377264404, "grad_norm": 0.416610449552536, "learning_rate": 5.562489626255104e-06, "loss": 0.6741, "step": 2380 }, { "epoch": 0.48033235932885104, "grad_norm": 0.38036978244781494, "learning_rate": 5.559242523110577e-06, "loss": 0.6823, "step": 2381 }, { "epoch": 0.480534094885058, "grad_norm": 0.33574825525283813, "learning_rate": 5.555995181101517e-06, "loss": 0.7284, "step": 2382 }, { "epoch": 0.480735830441265, "grad_norm": 0.5829797983169556, "learning_rate": 5.552747601614932e-06, "loss": 0.6931, "step": 2383 }, { "epoch": 0.480937565997472, "grad_norm": 1.0719894170761108, "learning_rate": 5.549499786037932e-06, "loss": 0.6811, "step": 2384 }, { "epoch": 0.481139301553679, "grad_norm": 0.27220767736434937, "learning_rate": 5.5462517357577325e-06, "loss": 0.7297, "step": 2385 }, { "epoch": 0.48134103710988596, "grad_norm": 0.42966559529304504, "learning_rate": 5.543003452161644e-06, "loss": 0.6712, "step": 2386 }, { "epoch": 0.48154277266609297, "grad_norm": 1.9052566289901733, "learning_rate": 5.539754936637079e-06, "loss": 0.6926, "step": 2387 }, { "epoch": 0.48174450822229997, "grad_norm": 0.33792710304260254, "learning_rate": 5.536506190571546e-06, "loss": 0.6334, "step": 2388 }, { "epoch": 0.481946243778507, "grad_norm": 0.7136744260787964, "learning_rate": 5.5332572153526574e-06, "loss": 0.6855, "step": 2389 }, { "epoch": 0.482147979334714, "grad_norm": 2.199124574661255, "learning_rate": 5.530008012368119e-06, "loss": 0.6911, "step": 2390 }, { "epoch": 0.48234971489092093, "grad_norm": 1.1589012145996094, "learning_rate": 5.526758583005736e-06, "loss": 0.7076, "step": 2391 }, { "epoch": 0.48255145044712794, "grad_norm": 1.6382700204849243, "learning_rate": 5.52350892865341e-06, "loss": 0.6658, "step": 2392 }, { "epoch": 0.48275318600333494, "grad_norm": 1.535251259803772, "learning_rate": 5.520259050699138e-06, "loss": 0.8544, "step": 2393 }, { "epoch": 0.48295492155954195, "grad_norm": 0.8162296414375305, "learning_rate": 5.517008950531013e-06, "loss": 0.7948, "step": 2394 }, { "epoch": 0.4831566571157489, "grad_norm": 0.4203198254108429, "learning_rate": 5.5137586295372215e-06, "loss": 0.6362, "step": 2395 }, { "epoch": 0.4833583926719559, "grad_norm": 0.3570404648780823, "learning_rate": 5.510508089106049e-06, "loss": 0.6748, "step": 2396 }, { "epoch": 0.4835601282281629, "grad_norm": 0.5715703964233398, "learning_rate": 5.507257330625869e-06, "loss": 0.6902, "step": 2397 }, { "epoch": 0.4837618637843699, "grad_norm": 0.3944864571094513, "learning_rate": 5.50400635548515e-06, "loss": 0.6548, "step": 2398 }, { "epoch": 0.4839635993405769, "grad_norm": 0.45001280307769775, "learning_rate": 5.500755165072453e-06, "loss": 0.7976, "step": 2399 }, { "epoch": 0.48416533489678387, "grad_norm": 0.5098247528076172, "learning_rate": 5.497503760776436e-06, "loss": 0.6763, "step": 2400 }, { "epoch": 0.4843670704529909, "grad_norm": 0.478303998708725, "learning_rate": 5.4942521439858386e-06, "loss": 0.6613, "step": 2401 }, { "epoch": 0.4845688060091979, "grad_norm": 0.4981384873390198, "learning_rate": 5.491000316089499e-06, "loss": 0.7707, "step": 2402 }, { "epoch": 0.4847705415654049, "grad_norm": 0.579944908618927, "learning_rate": 5.487748278476342e-06, "loss": 0.7008, "step": 2403 }, { "epoch": 0.4849722771216119, "grad_norm": 0.4089328348636627, "learning_rate": 5.484496032535385e-06, "loss": 0.6602, "step": 2404 }, { "epoch": 0.48517401267781884, "grad_norm": 0.6986388564109802, "learning_rate": 5.48124357965573e-06, "loss": 0.6808, "step": 2405 }, { "epoch": 0.48537574823402585, "grad_norm": 0.3838619589805603, "learning_rate": 5.477990921226569e-06, "loss": 0.6708, "step": 2406 }, { "epoch": 0.48557748379023286, "grad_norm": 0.38885095715522766, "learning_rate": 5.474738058637185e-06, "loss": 0.6856, "step": 2407 }, { "epoch": 0.48577921934643986, "grad_norm": 2.982579469680786, "learning_rate": 5.471484993276945e-06, "loss": 0.7715, "step": 2408 }, { "epoch": 0.4859809549026468, "grad_norm": 0.6595232486724854, "learning_rate": 5.4682317265353025e-06, "loss": 0.6493, "step": 2409 }, { "epoch": 0.4861826904588538, "grad_norm": 0.39451122283935547, "learning_rate": 5.464978259801797e-06, "loss": 0.6916, "step": 2410 }, { "epoch": 0.4863844260150608, "grad_norm": 0.5617831349372864, "learning_rate": 5.461724594466059e-06, "loss": 0.7956, "step": 2411 }, { "epoch": 0.48658616157126783, "grad_norm": 1.7667847871780396, "learning_rate": 5.458470731917794e-06, "loss": 0.7746, "step": 2412 }, { "epoch": 0.48678789712747483, "grad_norm": 1.2532994747161865, "learning_rate": 5.455216673546798e-06, "loss": 0.6748, "step": 2413 }, { "epoch": 0.4869896326836818, "grad_norm": 0.6899948716163635, "learning_rate": 5.451962420742951e-06, "loss": 0.7031, "step": 2414 }, { "epoch": 0.4871913682398888, "grad_norm": 0.8961516618728638, "learning_rate": 5.448707974896214e-06, "loss": 0.7372, "step": 2415 }, { "epoch": 0.4873931037960958, "grad_norm": 1.3983064889907837, "learning_rate": 5.445453337396629e-06, "loss": 0.6774, "step": 2416 }, { "epoch": 0.4875948393523028, "grad_norm": 0.7456166744232178, "learning_rate": 5.442198509634324e-06, "loss": 0.6806, "step": 2417 }, { "epoch": 0.48779657490850975, "grad_norm": 0.5494102835655212, "learning_rate": 5.438943492999504e-06, "loss": 0.6734, "step": 2418 }, { "epoch": 0.48799831046471676, "grad_norm": 1.1771401166915894, "learning_rate": 5.435688288882461e-06, "loss": 0.813, "step": 2419 }, { "epoch": 0.48820004602092376, "grad_norm": 0.3710864186286926, "learning_rate": 5.432432898673558e-06, "loss": 0.7281, "step": 2420 }, { "epoch": 0.48840178157713077, "grad_norm": 0.4745769500732422, "learning_rate": 5.429177323763245e-06, "loss": 0.7416, "step": 2421 }, { "epoch": 0.4886035171333378, "grad_norm": 1.0034306049346924, "learning_rate": 5.425921565542047e-06, "loss": 0.6967, "step": 2422 }, { "epoch": 0.4888052526895447, "grad_norm": 0.498399555683136, "learning_rate": 5.4226656254005686e-06, "loss": 0.6518, "step": 2423 }, { "epoch": 0.48900698824575173, "grad_norm": 0.38673850893974304, "learning_rate": 5.4194095047294935e-06, "loss": 0.7897, "step": 2424 }, { "epoch": 0.48920872380195873, "grad_norm": 0.32360196113586426, "learning_rate": 5.41615320491958e-06, "loss": 0.7278, "step": 2425 }, { "epoch": 0.48941045935816574, "grad_norm": 0.7270029783248901, "learning_rate": 5.412896727361663e-06, "loss": 0.6923, "step": 2426 }, { "epoch": 0.4896121949143727, "grad_norm": 0.5027311444282532, "learning_rate": 5.409640073446654e-06, "loss": 0.6943, "step": 2427 }, { "epoch": 0.4898139304705797, "grad_norm": 0.6347085237503052, "learning_rate": 5.406383244565543e-06, "loss": 0.7144, "step": 2428 }, { "epoch": 0.4900156660267867, "grad_norm": 0.40251901745796204, "learning_rate": 5.40312624210939e-06, "loss": 0.6694, "step": 2429 }, { "epoch": 0.4902174015829937, "grad_norm": 0.5375779271125793, "learning_rate": 5.3998690674693286e-06, "loss": 0.6423, "step": 2430 }, { "epoch": 0.4904191371392007, "grad_norm": 0.5994971394538879, "learning_rate": 5.396611722036573e-06, "loss": 0.6715, "step": 2431 }, { "epoch": 0.49062087269540766, "grad_norm": 0.38045647740364075, "learning_rate": 5.393354207202404e-06, "loss": 0.7003, "step": 2432 }, { "epoch": 0.49082260825161467, "grad_norm": 0.4006095230579376, "learning_rate": 5.390096524358175e-06, "loss": 0.9497, "step": 2433 }, { "epoch": 0.4910243438078217, "grad_norm": 0.7631081938743591, "learning_rate": 5.386838674895311e-06, "loss": 0.6707, "step": 2434 }, { "epoch": 0.4912260793640287, "grad_norm": 0.3868980407714844, "learning_rate": 5.383580660205313e-06, "loss": 0.657, "step": 2435 }, { "epoch": 0.49142781492023563, "grad_norm": 0.3521217405796051, "learning_rate": 5.3803224816797495e-06, "loss": 0.8157, "step": 2436 }, { "epoch": 0.49162955047644263, "grad_norm": 0.43658751249313354, "learning_rate": 5.3770641407102554e-06, "loss": 0.6828, "step": 2437 }, { "epoch": 0.49183128603264964, "grad_norm": 0.375871479511261, "learning_rate": 5.373805638688542e-06, "loss": 0.6761, "step": 2438 }, { "epoch": 0.49203302158885664, "grad_norm": 0.8562093377113342, "learning_rate": 5.370546977006383e-06, "loss": 0.7243, "step": 2439 }, { "epoch": 0.49223475714506365, "grad_norm": 0.3729056119918823, "learning_rate": 5.367288157055626e-06, "loss": 0.7204, "step": 2440 }, { "epoch": 0.4924364927012706, "grad_norm": 0.8656922578811646, "learning_rate": 5.36402918022818e-06, "loss": 0.8264, "step": 2441 }, { "epoch": 0.4926382282574776, "grad_norm": 0.4582517445087433, "learning_rate": 5.360770047916025e-06, "loss": 0.6895, "step": 2442 }, { "epoch": 0.4928399638136846, "grad_norm": 0.7141796946525574, "learning_rate": 5.3575107615112084e-06, "loss": 0.7993, "step": 2443 }, { "epoch": 0.4930416993698916, "grad_norm": 0.6745219826698303, "learning_rate": 5.35425132240584e-06, "loss": 0.6812, "step": 2444 }, { "epoch": 0.49324343492609857, "grad_norm": 0.3646477162837982, "learning_rate": 5.350991731992098e-06, "loss": 0.6652, "step": 2445 }, { "epoch": 0.4934451704823056, "grad_norm": 0.8088419437408447, "learning_rate": 5.3477319916622215e-06, "loss": 0.7443, "step": 2446 }, { "epoch": 0.4936469060385126, "grad_norm": 0.41926172375679016, "learning_rate": 5.344472102808519e-06, "loss": 0.6607, "step": 2447 }, { "epoch": 0.4938486415947196, "grad_norm": 1.1572102308273315, "learning_rate": 5.341212066823356e-06, "loss": 0.7375, "step": 2448 }, { "epoch": 0.4940503771509266, "grad_norm": 0.433287650346756, "learning_rate": 5.337951885099167e-06, "loss": 0.7003, "step": 2449 }, { "epoch": 0.49425211270713354, "grad_norm": 0.48769572377204895, "learning_rate": 5.334691559028442e-06, "loss": 0.8846, "step": 2450 }, { "epoch": 0.49445384826334055, "grad_norm": 0.565112829208374, "learning_rate": 5.331431090003739e-06, "loss": 0.6548, "step": 2451 }, { "epoch": 0.49465558381954755, "grad_norm": 1.2953200340270996, "learning_rate": 5.328170479417676e-06, "loss": 0.6966, "step": 2452 }, { "epoch": 0.49485731937575456, "grad_norm": 1.8815406560897827, "learning_rate": 5.324909728662929e-06, "loss": 0.6353, "step": 2453 }, { "epoch": 0.4950590549319615, "grad_norm": 1.499898076057434, "learning_rate": 5.321648839132233e-06, "loss": 0.6948, "step": 2454 }, { "epoch": 0.4952607904881685, "grad_norm": 2.585374355316162, "learning_rate": 5.318387812218386e-06, "loss": 0.6784, "step": 2455 }, { "epoch": 0.4954625260443755, "grad_norm": 1.7011631727218628, "learning_rate": 5.315126649314244e-06, "loss": 0.6391, "step": 2456 }, { "epoch": 0.4956642616005825, "grad_norm": 1.2485464811325073, "learning_rate": 5.311865351812718e-06, "loss": 0.6842, "step": 2457 }, { "epoch": 0.49586599715678953, "grad_norm": 1.5083630084991455, "learning_rate": 5.308603921106777e-06, "loss": 0.6877, "step": 2458 }, { "epoch": 0.4960677327129965, "grad_norm": 0.4522298574447632, "learning_rate": 5.305342358589452e-06, "loss": 0.6042, "step": 2459 }, { "epoch": 0.4962694682692035, "grad_norm": 0.3971395790576935, "learning_rate": 5.302080665653826e-06, "loss": 0.6506, "step": 2460 }, { "epoch": 0.4964712038254105, "grad_norm": 0.4974903166294098, "learning_rate": 5.298818843693035e-06, "loss": 0.7048, "step": 2461 }, { "epoch": 0.4966729393816175, "grad_norm": 0.8520945310592651, "learning_rate": 5.295556894100278e-06, "loss": 0.9368, "step": 2462 }, { "epoch": 0.49687467493782445, "grad_norm": 0.36369588971138, "learning_rate": 5.292294818268801e-06, "loss": 0.6535, "step": 2463 }, { "epoch": 0.49707641049403145, "grad_norm": 0.39193958044052124, "learning_rate": 5.289032617591908e-06, "loss": 0.6962, "step": 2464 }, { "epoch": 0.49727814605023846, "grad_norm": 0.5720986127853394, "learning_rate": 5.285770293462954e-06, "loss": 0.6672, "step": 2465 }, { "epoch": 0.49747988160644546, "grad_norm": 0.444812536239624, "learning_rate": 5.2825078472753476e-06, "loss": 0.6347, "step": 2466 }, { "epoch": 0.49768161716265247, "grad_norm": 0.45876580476760864, "learning_rate": 5.2792452804225535e-06, "loss": 0.725, "step": 2467 }, { "epoch": 0.4978833527188594, "grad_norm": 0.6092671751976013, "learning_rate": 5.275982594298081e-06, "loss": 0.763, "step": 2468 }, { "epoch": 0.4980850882750664, "grad_norm": 0.5046172142028809, "learning_rate": 5.2727197902954954e-06, "loss": 0.7659, "step": 2469 }, { "epoch": 0.49828682383127343, "grad_norm": 0.3817034959793091, "learning_rate": 5.2694568698084085e-06, "loss": 0.8996, "step": 2470 }, { "epoch": 0.49848855938748043, "grad_norm": 0.3855259418487549, "learning_rate": 5.266193834230485e-06, "loss": 0.7581, "step": 2471 }, { "epoch": 0.4986902949436874, "grad_norm": 0.6069688200950623, "learning_rate": 5.262930684955439e-06, "loss": 0.7928, "step": 2472 }, { "epoch": 0.4988920304998944, "grad_norm": 0.3956807851791382, "learning_rate": 5.25966742337703e-06, "loss": 0.66, "step": 2473 }, { "epoch": 0.4990937660561014, "grad_norm": 0.43637824058532715, "learning_rate": 5.256404050889069e-06, "loss": 0.7644, "step": 2474 }, { "epoch": 0.4992955016123084, "grad_norm": 0.49486860632896423, "learning_rate": 5.253140568885412e-06, "loss": 0.7436, "step": 2475 }, { "epoch": 0.4994972371685154, "grad_norm": 0.4755359888076782, "learning_rate": 5.249876978759961e-06, "loss": 0.6725, "step": 2476 }, { "epoch": 0.49969897272472236, "grad_norm": 0.5608794689178467, "learning_rate": 5.246613281906669e-06, "loss": 0.6263, "step": 2477 }, { "epoch": 0.49990070828092936, "grad_norm": 0.692234992980957, "learning_rate": 5.243349479719528e-06, "loss": 0.6567, "step": 2478 }, { "epoch": 0.5001024438371363, "grad_norm": 0.9859315156936646, "learning_rate": 5.240085573592579e-06, "loss": 0.7421, "step": 2479 }, { "epoch": 0.5003041793933434, "grad_norm": 0.5274341106414795, "learning_rate": 5.236821564919909e-06, "loss": 0.6897, "step": 2480 }, { "epoch": 0.5005059149495503, "grad_norm": 0.6965314149856567, "learning_rate": 5.233557455095645e-06, "loss": 0.6506, "step": 2481 }, { "epoch": 0.5007076505057574, "grad_norm": 0.578256368637085, "learning_rate": 5.230293245513956e-06, "loss": 0.6574, "step": 2482 }, { "epoch": 0.5009093860619643, "grad_norm": 0.34265127778053284, "learning_rate": 5.22702893756906e-06, "loss": 0.6711, "step": 2483 }, { "epoch": 0.5011111216181713, "grad_norm": 0.5414084196090698, "learning_rate": 5.2237645326552125e-06, "loss": 0.6894, "step": 2484 }, { "epoch": 0.5013128571743783, "grad_norm": 0.41821205615997314, "learning_rate": 5.220500032166709e-06, "loss": 0.8267, "step": 2485 }, { "epoch": 0.5015145927305853, "grad_norm": 1.5558137893676758, "learning_rate": 5.2172354374978905e-06, "loss": 0.7558, "step": 2486 }, { "epoch": 0.5017163282867924, "grad_norm": 1.021885871887207, "learning_rate": 5.213970750043135e-06, "loss": 0.8056, "step": 2487 }, { "epoch": 0.5019180638429993, "grad_norm": 1.5603513717651367, "learning_rate": 5.210705971196861e-06, "loss": 0.6761, "step": 2488 }, { "epoch": 0.5021197993992063, "grad_norm": 0.47618329524993896, "learning_rate": 5.207441102353524e-06, "loss": 0.6797, "step": 2489 }, { "epoch": 0.5023215349554133, "grad_norm": 0.36025211215019226, "learning_rate": 5.204176144907624e-06, "loss": 0.8224, "step": 2490 }, { "epoch": 0.5025232705116203, "grad_norm": 1.2106201648712158, "learning_rate": 5.20091110025369e-06, "loss": 0.717, "step": 2491 }, { "epoch": 0.5027250060678273, "grad_norm": 0.40544193983078003, "learning_rate": 5.197645969786297e-06, "loss": 0.6746, "step": 2492 }, { "epoch": 0.5029267416240343, "grad_norm": 0.34461602568626404, "learning_rate": 5.194380754900049e-06, "loss": 0.7017, "step": 2493 }, { "epoch": 0.5031284771802412, "grad_norm": 0.5607613921165466, "learning_rate": 5.1911154569895915e-06, "loss": 0.6919, "step": 2494 }, { "epoch": 0.5033302127364483, "grad_norm": 0.49895989894866943, "learning_rate": 5.187850077449604e-06, "loss": 0.8624, "step": 2495 }, { "epoch": 0.5035319482926552, "grad_norm": 0.5202046036720276, "learning_rate": 5.1845846176748005e-06, "loss": 0.7062, "step": 2496 }, { "epoch": 0.5037336838488622, "grad_norm": 0.42681393027305603, "learning_rate": 5.181319079059928e-06, "loss": 0.7279, "step": 2497 }, { "epoch": 0.5039354194050693, "grad_norm": 0.3662031292915344, "learning_rate": 5.178053462999768e-06, "loss": 0.6648, "step": 2498 }, { "epoch": 0.5041371549612762, "grad_norm": 0.4268733859062195, "learning_rate": 5.174787770889138e-06, "loss": 0.62, "step": 2499 }, { "epoch": 0.5043388905174833, "grad_norm": 0.4190344214439392, "learning_rate": 5.1715220041228835e-06, "loss": 0.6409, "step": 2500 }, { "epoch": 0.5045406260736902, "grad_norm": 0.4919296205043793, "learning_rate": 5.168256164095885e-06, "loss": 0.6389, "step": 2501 }, { "epoch": 0.5047423616298972, "grad_norm": 0.32788679003715515, "learning_rate": 5.164990252203052e-06, "loss": 0.6515, "step": 2502 }, { "epoch": 0.5049440971861042, "grad_norm": 0.4756167232990265, "learning_rate": 5.1617242698393265e-06, "loss": 0.6804, "step": 2503 }, { "epoch": 0.5051458327423112, "grad_norm": 0.49856603145599365, "learning_rate": 5.15845821839968e-06, "loss": 0.6365, "step": 2504 }, { "epoch": 0.5053475682985182, "grad_norm": 0.3664408028125763, "learning_rate": 5.155192099279113e-06, "loss": 0.6819, "step": 2505 }, { "epoch": 0.5055493038547252, "grad_norm": 0.637521505355835, "learning_rate": 5.151925913872657e-06, "loss": 0.6986, "step": 2506 }, { "epoch": 0.5057510394109321, "grad_norm": 0.29554644227027893, "learning_rate": 5.148659663575367e-06, "loss": 0.7517, "step": 2507 }, { "epoch": 0.5059527749671392, "grad_norm": 0.42966410517692566, "learning_rate": 5.1453933497823326e-06, "loss": 0.7161, "step": 2508 }, { "epoch": 0.5061545105233461, "grad_norm": 0.32631874084472656, "learning_rate": 5.1421269738886635e-06, "loss": 0.638, "step": 2509 }, { "epoch": 0.5063562460795532, "grad_norm": 0.39070239663124084, "learning_rate": 5.138860537289502e-06, "loss": 0.7415, "step": 2510 }, { "epoch": 0.5065579816357602, "grad_norm": 0.5472754240036011, "learning_rate": 5.135594041380012e-06, "loss": 0.6592, "step": 2511 }, { "epoch": 0.5067597171919671, "grad_norm": 0.603266716003418, "learning_rate": 5.132327487555385e-06, "loss": 0.6929, "step": 2512 }, { "epoch": 0.5069614527481742, "grad_norm": 1.3803391456604004, "learning_rate": 5.129060877210835e-06, "loss": 0.6848, "step": 2513 }, { "epoch": 0.5071631883043811, "grad_norm": 0.5233571529388428, "learning_rate": 5.125794211741602e-06, "loss": 1.0529, "step": 2514 }, { "epoch": 0.5073649238605881, "grad_norm": 0.5816826224327087, "learning_rate": 5.122527492542954e-06, "loss": 0.6554, "step": 2515 }, { "epoch": 0.5075666594167951, "grad_norm": 0.4094732105731964, "learning_rate": 5.119260721010171e-06, "loss": 0.6973, "step": 2516 }, { "epoch": 0.5077683949730021, "grad_norm": 0.6474040746688843, "learning_rate": 5.1159938985385625e-06, "loss": 0.6685, "step": 2517 }, { "epoch": 0.5079701305292091, "grad_norm": 0.47420260310173035, "learning_rate": 5.112727026523461e-06, "loss": 0.6705, "step": 2518 }, { "epoch": 0.5081718660854161, "grad_norm": 0.4265378415584564, "learning_rate": 5.1094601063602176e-06, "loss": 0.8524, "step": 2519 }, { "epoch": 0.508373601641623, "grad_norm": 1.0773259401321411, "learning_rate": 5.1061931394442045e-06, "loss": 0.6885, "step": 2520 }, { "epoch": 0.5085753371978301, "grad_norm": 0.4289955496788025, "learning_rate": 5.1029261271708104e-06, "loss": 0.6799, "step": 2521 }, { "epoch": 0.508777072754037, "grad_norm": 0.5529792904853821, "learning_rate": 5.099659070935451e-06, "loss": 0.8266, "step": 2522 }, { "epoch": 0.5089788083102441, "grad_norm": 0.7462150454521179, "learning_rate": 5.096391972133554e-06, "loss": 0.6685, "step": 2523 }, { "epoch": 0.5091805438664511, "grad_norm": 0.5908946394920349, "learning_rate": 5.093124832160569e-06, "loss": 0.668, "step": 2524 }, { "epoch": 0.509382279422658, "grad_norm": 0.7674736976623535, "learning_rate": 5.089857652411961e-06, "loss": 0.6632, "step": 2525 }, { "epoch": 0.5095840149788651, "grad_norm": 0.40639930963516235, "learning_rate": 5.086590434283212e-06, "loss": 0.6175, "step": 2526 }, { "epoch": 0.509785750535072, "grad_norm": 0.33126306533813477, "learning_rate": 5.083323179169824e-06, "loss": 0.6474, "step": 2527 }, { "epoch": 0.5099874860912791, "grad_norm": 0.30662772059440613, "learning_rate": 5.080055888467308e-06, "loss": 0.7701, "step": 2528 }, { "epoch": 0.510189221647486, "grad_norm": 0.3923175036907196, "learning_rate": 5.076788563571198e-06, "loss": 0.6567, "step": 2529 }, { "epoch": 0.510390957203693, "grad_norm": 0.5640500783920288, "learning_rate": 5.073521205877038e-06, "loss": 0.6456, "step": 2530 }, { "epoch": 0.5105926927599, "grad_norm": 0.6147575378417969, "learning_rate": 5.0702538167803864e-06, "loss": 0.6634, "step": 2531 }, { "epoch": 0.510794428316107, "grad_norm": 0.4823729693889618, "learning_rate": 5.0669863976768145e-06, "loss": 0.6735, "step": 2532 }, { "epoch": 0.510996163872314, "grad_norm": 0.47359368205070496, "learning_rate": 5.063718949961909e-06, "loss": 0.6704, "step": 2533 }, { "epoch": 0.511197899428521, "grad_norm": 2.3694827556610107, "learning_rate": 5.060451475031267e-06, "loss": 0.6973, "step": 2534 }, { "epoch": 0.511399634984728, "grad_norm": 0.32952338457107544, "learning_rate": 5.057183974280498e-06, "loss": 0.7599, "step": 2535 }, { "epoch": 0.511601370540935, "grad_norm": 0.5751401782035828, "learning_rate": 5.053916449105219e-06, "loss": 0.7763, "step": 2536 }, { "epoch": 0.511803106097142, "grad_norm": 0.3479835093021393, "learning_rate": 5.050648900901064e-06, "loss": 0.6997, "step": 2537 }, { "epoch": 0.5120048416533489, "grad_norm": 0.3522459864616394, "learning_rate": 5.047381331063672e-06, "loss": 0.8358, "step": 2538 }, { "epoch": 0.512206577209556, "grad_norm": 1.7044692039489746, "learning_rate": 5.044113740988692e-06, "loss": 0.8016, "step": 2539 }, { "epoch": 0.5124083127657629, "grad_norm": 0.4563331604003906, "learning_rate": 5.040846132071783e-06, "loss": 0.8318, "step": 2540 }, { "epoch": 0.51261004832197, "grad_norm": 0.47234177589416504, "learning_rate": 5.03757850570861e-06, "loss": 0.6481, "step": 2541 }, { "epoch": 0.5128117838781769, "grad_norm": 0.6565883755683899, "learning_rate": 5.034310863294847e-06, "loss": 0.8461, "step": 2542 }, { "epoch": 0.5130135194343839, "grad_norm": 0.44650590419769287, "learning_rate": 5.0310432062261764e-06, "loss": 0.684, "step": 2543 }, { "epoch": 0.513215254990591, "grad_norm": 0.4767093062400818, "learning_rate": 5.027775535898283e-06, "loss": 0.6856, "step": 2544 }, { "epoch": 0.5134169905467979, "grad_norm": 0.5772445797920227, "learning_rate": 5.024507853706858e-06, "loss": 0.6855, "step": 2545 }, { "epoch": 0.513618726103005, "grad_norm": 0.46750447154045105, "learning_rate": 5.021240161047601e-06, "loss": 0.6753, "step": 2546 }, { "epoch": 0.5138204616592119, "grad_norm": 0.733974277973175, "learning_rate": 5.0179724593162146e-06, "loss": 0.6634, "step": 2547 }, { "epoch": 0.5140221972154189, "grad_norm": 0.4525078535079956, "learning_rate": 5.014704749908404e-06, "loss": 0.7676, "step": 2548 }, { "epoch": 0.5142239327716259, "grad_norm": 0.4221421778202057, "learning_rate": 5.011437034219875e-06, "loss": 0.6615, "step": 2549 }, { "epoch": 0.5144256683278329, "grad_norm": 0.6879943013191223, "learning_rate": 5.0081693136463435e-06, "loss": 0.7909, "step": 2550 }, { "epoch": 0.5146274038840399, "grad_norm": 0.47413426637649536, "learning_rate": 5.004901589583524e-06, "loss": 0.649, "step": 2551 }, { "epoch": 0.5148291394402469, "grad_norm": 0.4079788625240326, "learning_rate": 5.0016338634271285e-06, "loss": 0.6717, "step": 2552 }, { "epoch": 0.5150308749964538, "grad_norm": 0.5021528601646423, "learning_rate": 4.998366136572874e-06, "loss": 0.6573, "step": 2553 }, { "epoch": 0.5152326105526609, "grad_norm": 0.5983197689056396, "learning_rate": 4.995098410416478e-06, "loss": 0.6964, "step": 2554 }, { "epoch": 0.5154343461088678, "grad_norm": 0.4596244990825653, "learning_rate": 4.9918306863536565e-06, "loss": 0.6676, "step": 2555 }, { "epoch": 0.5156360816650748, "grad_norm": 0.4215291142463684, "learning_rate": 4.988562965780127e-06, "loss": 0.8117, "step": 2556 }, { "epoch": 0.5158378172212819, "grad_norm": 1.1139490604400635, "learning_rate": 4.985295250091598e-06, "loss": 0.6803, "step": 2557 }, { "epoch": 0.5160395527774888, "grad_norm": 0.5091721415519714, "learning_rate": 4.982027540683785e-06, "loss": 0.6929, "step": 2558 }, { "epoch": 0.5162412883336959, "grad_norm": 0.6055331826210022, "learning_rate": 4.9787598389524e-06, "loss": 0.7099, "step": 2559 }, { "epoch": 0.5164430238899028, "grad_norm": 1.519207239151001, "learning_rate": 4.975492146293143e-06, "loss": 0.7387, "step": 2560 }, { "epoch": 0.5166447594461098, "grad_norm": 0.8859824538230896, "learning_rate": 4.97222446410172e-06, "loss": 0.8139, "step": 2561 }, { "epoch": 0.5168464950023168, "grad_norm": 1.6021531820297241, "learning_rate": 4.968956793773825e-06, "loss": 0.687, "step": 2562 }, { "epoch": 0.5170482305585238, "grad_norm": 7.087365627288818, "learning_rate": 4.965689136705153e-06, "loss": 0.7062, "step": 2563 }, { "epoch": 0.5172499661147308, "grad_norm": 0.7062385082244873, "learning_rate": 4.9624214942913916e-06, "loss": 0.699, "step": 2564 }, { "epoch": 0.5174517016709378, "grad_norm": 0.4900060296058655, "learning_rate": 4.959153867928218e-06, "loss": 0.7065, "step": 2565 }, { "epoch": 0.5176534372271447, "grad_norm": 0.656950056552887, "learning_rate": 4.955886259011308e-06, "loss": 0.67, "step": 2566 }, { "epoch": 0.5178551727833518, "grad_norm": 0.38856950402259827, "learning_rate": 4.95261866893633e-06, "loss": 0.6693, "step": 2567 }, { "epoch": 0.5180569083395588, "grad_norm": 0.4750538766384125, "learning_rate": 4.949351099098937e-06, "loss": 0.6941, "step": 2568 }, { "epoch": 0.5182586438957658, "grad_norm": 0.327970027923584, "learning_rate": 4.946083550894782e-06, "loss": 0.7079, "step": 2569 }, { "epoch": 0.5184603794519728, "grad_norm": 0.5009106397628784, "learning_rate": 4.942816025719505e-06, "loss": 0.7175, "step": 2570 }, { "epoch": 0.5186621150081797, "grad_norm": 0.7603925466537476, "learning_rate": 4.939548524968734e-06, "loss": 0.6647, "step": 2571 }, { "epoch": 0.5188638505643868, "grad_norm": 0.41649743914604187, "learning_rate": 4.936281050038091e-06, "loss": 0.6842, "step": 2572 }, { "epoch": 0.5190655861205937, "grad_norm": 0.43867188692092896, "learning_rate": 4.933013602323186e-06, "loss": 0.6821, "step": 2573 }, { "epoch": 0.5192673216768007, "grad_norm": 0.5117209553718567, "learning_rate": 4.929746183219615e-06, "loss": 0.7396, "step": 2574 }, { "epoch": 0.5194690572330077, "grad_norm": 1.3436833620071411, "learning_rate": 4.926478794122965e-06, "loss": 0.644, "step": 2575 }, { "epoch": 0.5196707927892147, "grad_norm": 1.052332878112793, "learning_rate": 4.923211436428804e-06, "loss": 0.6689, "step": 2576 }, { "epoch": 0.5198725283454217, "grad_norm": 0.5234437584877014, "learning_rate": 4.919944111532692e-06, "loss": 0.6491, "step": 2577 }, { "epoch": 0.5200742639016287, "grad_norm": 0.46440157294273376, "learning_rate": 4.91667682083018e-06, "loss": 0.7633, "step": 2578 }, { "epoch": 0.5202759994578356, "grad_norm": 0.45638078451156616, "learning_rate": 4.91340956571679e-06, "loss": 0.658, "step": 2579 }, { "epoch": 0.5204777350140427, "grad_norm": 0.34317252039909363, "learning_rate": 4.910142347588041e-06, "loss": 0.6906, "step": 2580 }, { "epoch": 0.5206794705702497, "grad_norm": 0.6732125282287598, "learning_rate": 4.906875167839433e-06, "loss": 0.9408, "step": 2581 }, { "epoch": 0.5208812061264567, "grad_norm": 0.6680876612663269, "learning_rate": 4.903608027866447e-06, "loss": 0.7812, "step": 2582 }, { "epoch": 0.5210829416826637, "grad_norm": 0.8660580515861511, "learning_rate": 4.90034092906455e-06, "loss": 1.1006, "step": 2583 }, { "epoch": 0.5212846772388706, "grad_norm": 0.478946715593338, "learning_rate": 4.89707387282919e-06, "loss": 0.8267, "step": 2584 }, { "epoch": 0.5214864127950777, "grad_norm": 0.3759441375732422, "learning_rate": 4.893806860555797e-06, "loss": 0.6477, "step": 2585 }, { "epoch": 0.5216881483512846, "grad_norm": 0.39136189222335815, "learning_rate": 4.890539893639782e-06, "loss": 0.6646, "step": 2586 }, { "epoch": 0.5218898839074917, "grad_norm": 0.4921991527080536, "learning_rate": 4.88727297347654e-06, "loss": 0.7019, "step": 2587 }, { "epoch": 0.5220916194636986, "grad_norm": 0.3825063407421112, "learning_rate": 4.884006101461438e-06, "loss": 0.8176, "step": 2588 }, { "epoch": 0.5222933550199056, "grad_norm": 0.3946991562843323, "learning_rate": 4.880739278989832e-06, "loss": 0.671, "step": 2589 }, { "epoch": 0.5224950905761127, "grad_norm": 0.5453450679779053, "learning_rate": 4.877472507457049e-06, "loss": 0.8733, "step": 2590 }, { "epoch": 0.5226968261323196, "grad_norm": 1.0942749977111816, "learning_rate": 4.874205788258397e-06, "loss": 0.843, "step": 2591 }, { "epoch": 0.5228985616885266, "grad_norm": 0.8049074411392212, "learning_rate": 4.870939122789167e-06, "loss": 0.6479, "step": 2592 }, { "epoch": 0.5231002972447336, "grad_norm": 0.5376644134521484, "learning_rate": 4.867672512444616e-06, "loss": 0.6544, "step": 2593 }, { "epoch": 0.5233020328009406, "grad_norm": 0.632375955581665, "learning_rate": 4.8644059586199885e-06, "loss": 0.682, "step": 2594 }, { "epoch": 0.5235037683571476, "grad_norm": 1.4146294593811035, "learning_rate": 4.8611394627105e-06, "loss": 0.6603, "step": 2595 }, { "epoch": 0.5237055039133546, "grad_norm": 0.33007383346557617, "learning_rate": 4.857873026111338e-06, "loss": 0.7352, "step": 2596 }, { "epoch": 0.5239072394695615, "grad_norm": 0.8308141231536865, "learning_rate": 4.854606650217668e-06, "loss": 0.7489, "step": 2597 }, { "epoch": 0.5241089750257686, "grad_norm": 0.9379283785820007, "learning_rate": 4.851340336424635e-06, "loss": 0.6454, "step": 2598 }, { "epoch": 0.5243107105819755, "grad_norm": 1.1256037950515747, "learning_rate": 4.848074086127345e-06, "loss": 0.6926, "step": 2599 }, { "epoch": 0.5245124461381826, "grad_norm": 0.892929196357727, "learning_rate": 4.844807900720888e-06, "loss": 0.6503, "step": 2600 }, { "epoch": 0.5247141816943895, "grad_norm": 0.7106110453605652, "learning_rate": 4.841541781600322e-06, "loss": 0.7273, "step": 2601 }, { "epoch": 0.5249159172505965, "grad_norm": 0.5377501249313354, "learning_rate": 4.838275730160675e-06, "loss": 0.7661, "step": 2602 }, { "epoch": 0.5251176528068036, "grad_norm": 0.4693327248096466, "learning_rate": 4.835009747796951e-06, "loss": 0.6407, "step": 2603 }, { "epoch": 0.5253193883630105, "grad_norm": 0.9355971813201904, "learning_rate": 4.831743835904117e-06, "loss": 0.6412, "step": 2604 }, { "epoch": 0.5255211239192176, "grad_norm": 0.6056692600250244, "learning_rate": 4.828477995877117e-06, "loss": 0.6361, "step": 2605 }, { "epoch": 0.5257228594754245, "grad_norm": 0.4959450364112854, "learning_rate": 4.825212229110864e-06, "loss": 0.8288, "step": 2606 }, { "epoch": 0.5259245950316315, "grad_norm": 0.8087174892425537, "learning_rate": 4.821946537000234e-06, "loss": 0.7336, "step": 2607 }, { "epoch": 0.5261263305878385, "grad_norm": 0.6815080642700195, "learning_rate": 4.818680920940074e-06, "loss": 0.6745, "step": 2608 }, { "epoch": 0.5263280661440455, "grad_norm": 0.4571034908294678, "learning_rate": 4.815415382325202e-06, "loss": 0.713, "step": 2609 }, { "epoch": 0.5265298017002524, "grad_norm": 0.361689031124115, "learning_rate": 4.8121499225503974e-06, "loss": 0.6651, "step": 2610 }, { "epoch": 0.5267315372564595, "grad_norm": 0.7336861491203308, "learning_rate": 4.808884543010409e-06, "loss": 0.6711, "step": 2611 }, { "epoch": 0.5269332728126664, "grad_norm": 0.6097553968429565, "learning_rate": 4.805619245099953e-06, "loss": 0.694, "step": 2612 }, { "epoch": 0.5271350083688735, "grad_norm": 0.3593609035015106, "learning_rate": 4.802354030213704e-06, "loss": 0.6593, "step": 2613 }, { "epoch": 0.5273367439250805, "grad_norm": 0.5035977363586426, "learning_rate": 4.7990888997463106e-06, "loss": 0.7097, "step": 2614 }, { "epoch": 0.5275384794812874, "grad_norm": 0.3543822169303894, "learning_rate": 4.795823855092379e-06, "loss": 0.7509, "step": 2615 }, { "epoch": 0.5277402150374945, "grad_norm": 0.6724951863288879, "learning_rate": 4.792558897646477e-06, "loss": 0.672, "step": 2616 }, { "epoch": 0.5279419505937014, "grad_norm": 0.35657966136932373, "learning_rate": 4.789294028803141e-06, "loss": 0.6623, "step": 2617 }, { "epoch": 0.5281436861499085, "grad_norm": 0.6644230484962463, "learning_rate": 4.786029249956866e-06, "loss": 0.6991, "step": 2618 }, { "epoch": 0.5283454217061154, "grad_norm": 0.40029600262641907, "learning_rate": 4.78276456250211e-06, "loss": 0.6563, "step": 2619 }, { "epoch": 0.5285471572623224, "grad_norm": 0.44226789474487305, "learning_rate": 4.779499967833292e-06, "loss": 0.6879, "step": 2620 }, { "epoch": 0.5287488928185294, "grad_norm": 0.4395109713077545, "learning_rate": 4.776235467344789e-06, "loss": 0.7705, "step": 2621 }, { "epoch": 0.5289506283747364, "grad_norm": 0.41959723830223083, "learning_rate": 4.772971062430941e-06, "loss": 0.6486, "step": 2622 }, { "epoch": 0.5291523639309434, "grad_norm": 0.5506021976470947, "learning_rate": 4.769706754486046e-06, "loss": 0.6839, "step": 2623 }, { "epoch": 0.5293540994871504, "grad_norm": 0.6641210913658142, "learning_rate": 4.766442544904357e-06, "loss": 0.7063, "step": 2624 }, { "epoch": 0.5295558350433573, "grad_norm": 0.9038758277893066, "learning_rate": 4.763178435080091e-06, "loss": 0.7615, "step": 2625 }, { "epoch": 0.5297575705995644, "grad_norm": 0.3876363933086395, "learning_rate": 4.759914426407422e-06, "loss": 0.6912, "step": 2626 }, { "epoch": 0.5299593061557714, "grad_norm": 0.3626331090927124, "learning_rate": 4.7566505202804736e-06, "loss": 0.6377, "step": 2627 }, { "epoch": 0.5301610417119783, "grad_norm": 8.561905860900879, "learning_rate": 4.7533867180933324e-06, "loss": 0.8409, "step": 2628 }, { "epoch": 0.5303627772681854, "grad_norm": 0.750117838382721, "learning_rate": 4.75012302124004e-06, "loss": 0.8042, "step": 2629 }, { "epoch": 0.5305645128243923, "grad_norm": 0.4639262557029724, "learning_rate": 4.746859431114589e-06, "loss": 0.8446, "step": 2630 }, { "epoch": 0.5307662483805994, "grad_norm": 0.44851747155189514, "learning_rate": 4.743595949110934e-06, "loss": 0.8183, "step": 2631 }, { "epoch": 0.5309679839368063, "grad_norm": 0.4516337811946869, "learning_rate": 4.7403325766229705e-06, "loss": 0.7315, "step": 2632 }, { "epoch": 0.5311697194930133, "grad_norm": 0.6819981932640076, "learning_rate": 4.737069315044562e-06, "loss": 0.7033, "step": 2633 }, { "epoch": 0.5313714550492203, "grad_norm": 0.4693869948387146, "learning_rate": 4.7338061657695164e-06, "loss": 0.7256, "step": 2634 }, { "epoch": 0.5315731906054273, "grad_norm": 0.5674824714660645, "learning_rate": 4.730543130191594e-06, "loss": 0.7296, "step": 2635 }, { "epoch": 0.5317749261616344, "grad_norm": 0.3459540009498596, "learning_rate": 4.727280209704507e-06, "loss": 0.659, "step": 2636 }, { "epoch": 0.5319766617178413, "grad_norm": 0.5303491353988647, "learning_rate": 4.7240174057019205e-06, "loss": 0.6834, "step": 2637 }, { "epoch": 0.5321783972740483, "grad_norm": 1.3827285766601562, "learning_rate": 4.720754719577448e-06, "loss": 0.7446, "step": 2638 }, { "epoch": 0.5323801328302553, "grad_norm": 0.5295196175575256, "learning_rate": 4.717492152724652e-06, "loss": 0.8191, "step": 2639 }, { "epoch": 0.5325818683864623, "grad_norm": 0.7308456301689148, "learning_rate": 4.714229706537048e-06, "loss": 0.6954, "step": 2640 }, { "epoch": 0.5327836039426693, "grad_norm": 0.5110881924629211, "learning_rate": 4.710967382408094e-06, "loss": 0.6741, "step": 2641 }, { "epoch": 0.5329853394988763, "grad_norm": 0.48731887340545654, "learning_rate": 4.707705181731202e-06, "loss": 0.7082, "step": 2642 }, { "epoch": 0.5331870750550832, "grad_norm": 0.35509830713272095, "learning_rate": 4.7044431058997245e-06, "loss": 0.6748, "step": 2643 }, { "epoch": 0.5333888106112903, "grad_norm": 0.7154707312583923, "learning_rate": 4.701181156306965e-06, "loss": 0.6625, "step": 2644 }, { "epoch": 0.5335905461674972, "grad_norm": 0.38529646396636963, "learning_rate": 4.697919334346177e-06, "loss": 0.8088, "step": 2645 }, { "epoch": 0.5337922817237043, "grad_norm": 0.45116767287254333, "learning_rate": 4.6946576414105485e-06, "loss": 0.7078, "step": 2646 }, { "epoch": 0.5339940172799112, "grad_norm": 0.43628716468811035, "learning_rate": 4.691396078893223e-06, "loss": 0.6913, "step": 2647 }, { "epoch": 0.5341957528361182, "grad_norm": 2.0100085735321045, "learning_rate": 4.6881346481872844e-06, "loss": 0.6698, "step": 2648 }, { "epoch": 0.5343974883923253, "grad_norm": 1.209532380104065, "learning_rate": 4.684873350685758e-06, "loss": 0.7494, "step": 2649 }, { "epoch": 0.5345992239485322, "grad_norm": 0.6590553522109985, "learning_rate": 4.681612187781614e-06, "loss": 0.6414, "step": 2650 }, { "epoch": 0.5348009595047392, "grad_norm": 0.7591909170150757, "learning_rate": 4.678351160867769e-06, "loss": 0.6677, "step": 2651 }, { "epoch": 0.5350026950609462, "grad_norm": 0.5568974614143372, "learning_rate": 4.675090271337072e-06, "loss": 0.7136, "step": 2652 }, { "epoch": 0.5352044306171532, "grad_norm": 1.6308541297912598, "learning_rate": 4.6718295205823235e-06, "loss": 0.769, "step": 2653 }, { "epoch": 0.5354061661733602, "grad_norm": 0.45534488558769226, "learning_rate": 4.668568909996263e-06, "loss": 0.735, "step": 2654 }, { "epoch": 0.5356079017295672, "grad_norm": 0.4230724275112152, "learning_rate": 4.66530844097156e-06, "loss": 0.7746, "step": 2655 }, { "epoch": 0.5358096372857741, "grad_norm": 1.834598183631897, "learning_rate": 4.662048114900837e-06, "loss": 0.6822, "step": 2656 }, { "epoch": 0.5360113728419812, "grad_norm": 0.5896740555763245, "learning_rate": 4.6587879331766465e-06, "loss": 0.6524, "step": 2657 }, { "epoch": 0.5362131083981881, "grad_norm": 0.9899855852127075, "learning_rate": 4.655527897191482e-06, "loss": 0.8368, "step": 2658 }, { "epoch": 0.5364148439543952, "grad_norm": 1.0821828842163086, "learning_rate": 4.652268008337779e-06, "loss": 0.8366, "step": 2659 }, { "epoch": 0.5366165795106022, "grad_norm": 0.3686734139919281, "learning_rate": 4.649008268007903e-06, "loss": 0.6939, "step": 2660 }, { "epoch": 0.5368183150668091, "grad_norm": 1.32119619846344, "learning_rate": 4.64574867759416e-06, "loss": 0.6831, "step": 2661 }, { "epoch": 0.5370200506230162, "grad_norm": 0.6293189525604248, "learning_rate": 4.642489238488794e-06, "loss": 0.6749, "step": 2662 }, { "epoch": 0.5372217861792231, "grad_norm": 0.7143060564994812, "learning_rate": 4.639229952083976e-06, "loss": 0.6889, "step": 2663 }, { "epoch": 0.5374235217354302, "grad_norm": 0.4893825352191925, "learning_rate": 4.635970819771822e-06, "loss": 0.6325, "step": 2664 }, { "epoch": 0.5376252572916371, "grad_norm": 0.44156011939048767, "learning_rate": 4.632711842944377e-06, "loss": 0.6458, "step": 2665 }, { "epoch": 0.5378269928478441, "grad_norm": 0.39792585372924805, "learning_rate": 4.629453022993618e-06, "loss": 0.7571, "step": 2666 }, { "epoch": 0.5380287284040511, "grad_norm": 0.3987579643726349, "learning_rate": 4.626194361311459e-06, "loss": 0.7626, "step": 2667 }, { "epoch": 0.5382304639602581, "grad_norm": 1.033350944519043, "learning_rate": 4.622935859289745e-06, "loss": 0.8932, "step": 2668 }, { "epoch": 0.538432199516465, "grad_norm": 0.35241934657096863, "learning_rate": 4.619677518320252e-06, "loss": 0.846, "step": 2669 }, { "epoch": 0.5386339350726721, "grad_norm": 0.35918742418289185, "learning_rate": 4.616419339794689e-06, "loss": 0.6199, "step": 2670 }, { "epoch": 0.538835670628879, "grad_norm": 0.774231493473053, "learning_rate": 4.613161325104691e-06, "loss": 0.6777, "step": 2671 }, { "epoch": 0.5390374061850861, "grad_norm": 0.6578600406646729, "learning_rate": 4.609903475641827e-06, "loss": 0.7652, "step": 2672 }, { "epoch": 0.5392391417412931, "grad_norm": 0.3553310036659241, "learning_rate": 4.606645792797599e-06, "loss": 0.675, "step": 2673 }, { "epoch": 0.5394408772975, "grad_norm": 0.7471398711204529, "learning_rate": 4.603388277963428e-06, "loss": 0.6971, "step": 2674 }, { "epoch": 0.5396426128537071, "grad_norm": 0.37425556778907776, "learning_rate": 4.600130932530671e-06, "loss": 0.6655, "step": 2675 }, { "epoch": 0.539844348409914, "grad_norm": 0.5239168405532837, "learning_rate": 4.596873757890612e-06, "loss": 0.6452, "step": 2676 }, { "epoch": 0.5400460839661211, "grad_norm": 0.6423122882843018, "learning_rate": 4.593616755434458e-06, "loss": 0.7154, "step": 2677 }, { "epoch": 0.540247819522328, "grad_norm": 0.9147586822509766, "learning_rate": 4.590359926553346e-06, "loss": 0.6945, "step": 2678 }, { "epoch": 0.540449555078535, "grad_norm": 0.5066952109336853, "learning_rate": 4.587103272638339e-06, "loss": 0.9241, "step": 2679 }, { "epoch": 0.540651290634742, "grad_norm": 0.5868780612945557, "learning_rate": 4.583846795080422e-06, "loss": 0.6798, "step": 2680 }, { "epoch": 0.540853026190949, "grad_norm": 0.3115403950214386, "learning_rate": 4.580590495270507e-06, "loss": 0.8002, "step": 2681 }, { "epoch": 0.541054761747156, "grad_norm": 1.4777621030807495, "learning_rate": 4.577334374599433e-06, "loss": 0.6667, "step": 2682 }, { "epoch": 0.541256497303363, "grad_norm": 0.6413019895553589, "learning_rate": 4.574078434457955e-06, "loss": 0.7311, "step": 2683 }, { "epoch": 0.54145823285957, "grad_norm": 0.40882059931755066, "learning_rate": 4.570822676236758e-06, "loss": 0.6541, "step": 2684 }, { "epoch": 0.541659968415777, "grad_norm": 0.6062789559364319, "learning_rate": 4.567567101326444e-06, "loss": 0.6362, "step": 2685 }, { "epoch": 0.541861703971984, "grad_norm": 0.42461323738098145, "learning_rate": 4.5643117111175405e-06, "loss": 0.673, "step": 2686 }, { "epoch": 0.5420634395281909, "grad_norm": 0.9184004068374634, "learning_rate": 4.5610565070004965e-06, "loss": 0.656, "step": 2687 }, { "epoch": 0.542265175084398, "grad_norm": 0.4189847409725189, "learning_rate": 4.557801490365678e-06, "loss": 0.6666, "step": 2688 }, { "epoch": 0.5424669106406049, "grad_norm": 0.4108622074127197, "learning_rate": 4.5545466626033715e-06, "loss": 0.6443, "step": 2689 }, { "epoch": 0.542668646196812, "grad_norm": 0.8937840461730957, "learning_rate": 4.551292025103789e-06, "loss": 0.708, "step": 2690 }, { "epoch": 0.5428703817530189, "grad_norm": 0.5205972790718079, "learning_rate": 4.548037579257051e-06, "loss": 0.6702, "step": 2691 }, { "epoch": 0.5430721173092259, "grad_norm": 0.6844760775566101, "learning_rate": 4.5447833264532015e-06, "loss": 0.6735, "step": 2692 }, { "epoch": 0.543273852865433, "grad_norm": 0.8154087662696838, "learning_rate": 4.541529268082208e-06, "loss": 0.6228, "step": 2693 }, { "epoch": 0.5434755884216399, "grad_norm": 0.631812572479248, "learning_rate": 4.538275405533943e-06, "loss": 0.7052, "step": 2694 }, { "epoch": 0.543677323977847, "grad_norm": 0.7014383673667908, "learning_rate": 4.535021740198202e-06, "loss": 0.7789, "step": 2695 }, { "epoch": 0.5438790595340539, "grad_norm": 0.7482444643974304, "learning_rate": 4.531768273464699e-06, "loss": 0.8676, "step": 2696 }, { "epoch": 0.5440807950902609, "grad_norm": 0.42839550971984863, "learning_rate": 4.5285150067230565e-06, "loss": 0.7038, "step": 2697 }, { "epoch": 0.5442825306464679, "grad_norm": 0.4500325620174408, "learning_rate": 4.525261941362818e-06, "loss": 0.6795, "step": 2698 }, { "epoch": 0.5444842662026749, "grad_norm": 5.416876792907715, "learning_rate": 4.522009078773433e-06, "loss": 0.714, "step": 2699 }, { "epoch": 0.5446860017588819, "grad_norm": 0.491653710603714, "learning_rate": 4.518756420344272e-06, "loss": 0.7071, "step": 2700 }, { "epoch": 0.5448877373150889, "grad_norm": 0.35450485348701477, "learning_rate": 4.515503967464619e-06, "loss": 0.6644, "step": 2701 }, { "epoch": 0.5450894728712958, "grad_norm": 0.4445685148239136, "learning_rate": 4.512251721523659e-06, "loss": 0.6412, "step": 2702 }, { "epoch": 0.5452912084275029, "grad_norm": 0.8250188827514648, "learning_rate": 4.508999683910503e-06, "loss": 0.6637, "step": 2703 }, { "epoch": 0.5454929439837098, "grad_norm": 0.38135024905204773, "learning_rate": 4.505747856014163e-06, "loss": 0.698, "step": 2704 }, { "epoch": 0.5456946795399168, "grad_norm": 0.35442817211151123, "learning_rate": 4.502496239223566e-06, "loss": 0.6717, "step": 2705 }, { "epoch": 0.5458964150961239, "grad_norm": 0.5252602100372314, "learning_rate": 4.499244834927547e-06, "loss": 0.8047, "step": 2706 }, { "epoch": 0.5460981506523308, "grad_norm": 0.40941813588142395, "learning_rate": 4.495993644514851e-06, "loss": 0.7421, "step": 2707 }, { "epoch": 0.5462998862085379, "grad_norm": 0.4588121175765991, "learning_rate": 4.492742669374133e-06, "loss": 0.6431, "step": 2708 }, { "epoch": 0.5465016217647448, "grad_norm": 0.4406556487083435, "learning_rate": 4.489491910893951e-06, "loss": 0.7046, "step": 2709 }, { "epoch": 0.5467033573209518, "grad_norm": 0.7718446850776672, "learning_rate": 4.486241370462779e-06, "loss": 0.6692, "step": 2710 }, { "epoch": 0.5469050928771588, "grad_norm": 0.38533902168273926, "learning_rate": 4.482991049468989e-06, "loss": 0.8709, "step": 2711 }, { "epoch": 0.5471068284333658, "grad_norm": 0.8942533135414124, "learning_rate": 4.479740949300864e-06, "loss": 0.6439, "step": 2712 }, { "epoch": 0.5473085639895728, "grad_norm": 0.5363048315048218, "learning_rate": 4.476491071346591e-06, "loss": 0.7975, "step": 2713 }, { "epoch": 0.5475102995457798, "grad_norm": 0.5082899928092957, "learning_rate": 4.473241416994265e-06, "loss": 0.7761, "step": 2714 }, { "epoch": 0.5477120351019867, "grad_norm": 0.4341282844543457, "learning_rate": 4.469991987631883e-06, "loss": 0.6799, "step": 2715 }, { "epoch": 0.5479137706581938, "grad_norm": 0.4084181785583496, "learning_rate": 4.466742784647344e-06, "loss": 0.7703, "step": 2716 }, { "epoch": 0.5481155062144007, "grad_norm": 0.4442834258079529, "learning_rate": 4.463493809428454e-06, "loss": 0.6482, "step": 2717 }, { "epoch": 0.5483172417706078, "grad_norm": 0.5705946087837219, "learning_rate": 4.460245063362925e-06, "loss": 0.6905, "step": 2718 }, { "epoch": 0.5485189773268148, "grad_norm": 0.30586788058280945, "learning_rate": 4.456996547838358e-06, "loss": 0.6811, "step": 2719 }, { "epoch": 0.5487207128830217, "grad_norm": 0.8047022223472595, "learning_rate": 4.4537482642422675e-06, "loss": 0.6551, "step": 2720 }, { "epoch": 0.5489224484392288, "grad_norm": 0.8891247510910034, "learning_rate": 4.450500213962069e-06, "loss": 0.7088, "step": 2721 }, { "epoch": 0.5491241839954357, "grad_norm": 1.040446400642395, "learning_rate": 4.447252398385071e-06, "loss": 0.933, "step": 2722 }, { "epoch": 0.5493259195516427, "grad_norm": 0.3285609483718872, "learning_rate": 4.444004818898484e-06, "loss": 0.713, "step": 2723 }, { "epoch": 0.5495276551078497, "grad_norm": 0.3531717360019684, "learning_rate": 4.440757476889424e-06, "loss": 0.6311, "step": 2724 }, { "epoch": 0.5497293906640567, "grad_norm": 0.35809552669525146, "learning_rate": 4.437510373744897e-06, "loss": 0.6997, "step": 2725 }, { "epoch": 0.5499311262202637, "grad_norm": 0.5928544998168945, "learning_rate": 4.4342635108518145e-06, "loss": 1.145, "step": 2726 }, { "epoch": 0.5501328617764707, "grad_norm": 1.4170804023742676, "learning_rate": 4.4310168895969755e-06, "loss": 0.8147, "step": 2727 }, { "epoch": 0.5503345973326776, "grad_norm": 0.4764517545700073, "learning_rate": 4.427770511367087e-06, "loss": 0.6892, "step": 2728 }, { "epoch": 0.5505363328888847, "grad_norm": 0.5142654180526733, "learning_rate": 4.424524377548747e-06, "loss": 0.8374, "step": 2729 }, { "epoch": 0.5507380684450917, "grad_norm": 0.4085748493671417, "learning_rate": 4.421278489528447e-06, "loss": 0.8751, "step": 2730 }, { "epoch": 0.5509398040012987, "grad_norm": 1.015724539756775, "learning_rate": 4.418032848692575e-06, "loss": 0.6493, "step": 2731 }, { "epoch": 0.5511415395575057, "grad_norm": 0.4627054035663605, "learning_rate": 4.414787456427419e-06, "loss": 0.8622, "step": 2732 }, { "epoch": 0.5513432751137126, "grad_norm": 0.35189080238342285, "learning_rate": 4.41154231411915e-06, "loss": 0.6559, "step": 2733 }, { "epoch": 0.5515450106699197, "grad_norm": 0.4287737011909485, "learning_rate": 4.408297423153841e-06, "loss": 0.6751, "step": 2734 }, { "epoch": 0.5517467462261266, "grad_norm": 0.7251502871513367, "learning_rate": 4.4050527849174555e-06, "loss": 0.7138, "step": 2735 }, { "epoch": 0.5519484817823337, "grad_norm": 1.1452211141586304, "learning_rate": 4.4018084007958475e-06, "loss": 0.7997, "step": 2736 }, { "epoch": 0.5521502173385406, "grad_norm": 0.4358304738998413, "learning_rate": 4.398564272174764e-06, "loss": 0.7486, "step": 2737 }, { "epoch": 0.5523519528947476, "grad_norm": 0.4466962516307831, "learning_rate": 4.3953204004398434e-06, "loss": 0.6605, "step": 2738 }, { "epoch": 0.5525536884509546, "grad_norm": 0.330139696598053, "learning_rate": 4.392076786976609e-06, "loss": 0.6841, "step": 2739 }, { "epoch": 0.5527554240071616, "grad_norm": 0.3641626536846161, "learning_rate": 4.388833433170482e-06, "loss": 0.6266, "step": 2740 }, { "epoch": 0.5529571595633687, "grad_norm": 0.3895452916622162, "learning_rate": 4.3855903404067665e-06, "loss": 0.6705, "step": 2741 }, { "epoch": 0.5531588951195756, "grad_norm": 0.3152562379837036, "learning_rate": 4.382347510070659e-06, "loss": 0.6698, "step": 2742 }, { "epoch": 0.5533606306757826, "grad_norm": 0.41445258259773254, "learning_rate": 4.379104943547242e-06, "loss": 0.7837, "step": 2743 }, { "epoch": 0.5535623662319896, "grad_norm": 0.3514803647994995, "learning_rate": 4.3758626422214836e-06, "loss": 0.6789, "step": 2744 }, { "epoch": 0.5537641017881966, "grad_norm": 0.3821863532066345, "learning_rate": 4.372620607478242e-06, "loss": 0.8194, "step": 2745 }, { "epoch": 0.5539658373444035, "grad_norm": 0.46488285064697266, "learning_rate": 4.369378840702263e-06, "loss": 0.6812, "step": 2746 }, { "epoch": 0.5541675729006106, "grad_norm": 0.7570337653160095, "learning_rate": 4.366137343278168e-06, "loss": 0.6632, "step": 2747 }, { "epoch": 0.5543693084568175, "grad_norm": 0.5926573872566223, "learning_rate": 4.362896116590475e-06, "loss": 0.9003, "step": 2748 }, { "epoch": 0.5545710440130246, "grad_norm": 0.6851587891578674, "learning_rate": 4.359655162023585e-06, "loss": 0.8732, "step": 2749 }, { "epoch": 0.5547727795692315, "grad_norm": 0.44598305225372314, "learning_rate": 4.356414480961773e-06, "loss": 1.022, "step": 2750 }, { "epoch": 0.5549745151254385, "grad_norm": 0.44037872552871704, "learning_rate": 4.353174074789207e-06, "loss": 0.6514, "step": 2751 }, { "epoch": 0.5551762506816456, "grad_norm": 0.5381491184234619, "learning_rate": 4.349933944889934e-06, "loss": 0.6869, "step": 2752 }, { "epoch": 0.5553779862378525, "grad_norm": 1.0382646322250366, "learning_rate": 4.346694092647883e-06, "loss": 0.6822, "step": 2753 }, { "epoch": 0.5555797217940596, "grad_norm": 0.8446259498596191, "learning_rate": 4.343454519446866e-06, "loss": 0.7246, "step": 2754 }, { "epoch": 0.5557814573502665, "grad_norm": 0.3245503902435303, "learning_rate": 4.340215226670572e-06, "loss": 0.7802, "step": 2755 }, { "epoch": 0.5559831929064735, "grad_norm": 0.41590356826782227, "learning_rate": 4.336976215702574e-06, "loss": 0.6587, "step": 2756 }, { "epoch": 0.5561849284626805, "grad_norm": 0.478512167930603, "learning_rate": 4.333737487926326e-06, "loss": 0.6288, "step": 2757 }, { "epoch": 0.5563866640188875, "grad_norm": 0.3754807114601135, "learning_rate": 4.330499044725154e-06, "loss": 0.6632, "step": 2758 }, { "epoch": 0.5565883995750945, "grad_norm": 0.34584978222846985, "learning_rate": 4.327260887482269e-06, "loss": 0.6748, "step": 2759 }, { "epoch": 0.5567901351313015, "grad_norm": 0.45819729566574097, "learning_rate": 4.324023017580759e-06, "loss": 0.701, "step": 2760 }, { "epoch": 0.5569918706875084, "grad_norm": 0.6208237409591675, "learning_rate": 4.320785436403585e-06, "loss": 0.6867, "step": 2761 }, { "epoch": 0.5571936062437155, "grad_norm": 0.4650062322616577, "learning_rate": 4.31754814533359e-06, "loss": 0.6537, "step": 2762 }, { "epoch": 0.5573953417999225, "grad_norm": 0.5537533760070801, "learning_rate": 4.3143111457534905e-06, "loss": 0.6722, "step": 2763 }, { "epoch": 0.5575970773561294, "grad_norm": 0.8163173794746399, "learning_rate": 4.311074439045878e-06, "loss": 0.714, "step": 2764 }, { "epoch": 0.5577988129123365, "grad_norm": 0.5011228322982788, "learning_rate": 4.30783802659322e-06, "loss": 0.6721, "step": 2765 }, { "epoch": 0.5580005484685434, "grad_norm": 0.4537436068058014, "learning_rate": 4.30460190977786e-06, "loss": 0.662, "step": 2766 }, { "epoch": 0.5582022840247505, "grad_norm": 0.5868704915046692, "learning_rate": 4.301366089982009e-06, "loss": 0.7899, "step": 2767 }, { "epoch": 0.5584040195809574, "grad_norm": 0.309226930141449, "learning_rate": 4.29813056858776e-06, "loss": 0.6806, "step": 2768 }, { "epoch": 0.5586057551371644, "grad_norm": 0.4315721392631531, "learning_rate": 4.2948953469770695e-06, "loss": 0.8418, "step": 2769 }, { "epoch": 0.5588074906933714, "grad_norm": 0.5131277441978455, "learning_rate": 4.291660426531773e-06, "loss": 0.7035, "step": 2770 }, { "epoch": 0.5590092262495784, "grad_norm": 0.44210201501846313, "learning_rate": 4.2884258086335755e-06, "loss": 0.6561, "step": 2771 }, { "epoch": 0.5592109618057854, "grad_norm": 0.49398237466812134, "learning_rate": 4.285191494664049e-06, "loss": 0.6542, "step": 2772 }, { "epoch": 0.5594126973619924, "grad_norm": 0.515116274356842, "learning_rate": 4.281957486004642e-06, "loss": 0.6596, "step": 2773 }, { "epoch": 0.5596144329181993, "grad_norm": 0.5115038752555847, "learning_rate": 4.278723784036667e-06, "loss": 0.739, "step": 2774 }, { "epoch": 0.5598161684744064, "grad_norm": 0.6491000056266785, "learning_rate": 4.275490390141309e-06, "loss": 0.6157, "step": 2775 }, { "epoch": 0.5600179040306134, "grad_norm": 0.33408376574516296, "learning_rate": 4.272257305699619e-06, "loss": 0.7125, "step": 2776 }, { "epoch": 0.5602196395868204, "grad_norm": 0.40321362018585205, "learning_rate": 4.26902453209252e-06, "loss": 0.6438, "step": 2777 }, { "epoch": 0.5604213751430274, "grad_norm": 0.5445034503936768, "learning_rate": 4.265792070700796e-06, "loss": 0.669, "step": 2778 }, { "epoch": 0.5606231106992343, "grad_norm": 0.43536266684532166, "learning_rate": 4.262559922905101e-06, "loss": 0.6817, "step": 2779 }, { "epoch": 0.5608248462554414, "grad_norm": 0.39567360281944275, "learning_rate": 4.259328090085958e-06, "loss": 0.7887, "step": 2780 }, { "epoch": 0.5610265818116483, "grad_norm": 0.8040300607681274, "learning_rate": 4.256096573623748e-06, "loss": 0.7735, "step": 2781 }, { "epoch": 0.5612283173678553, "grad_norm": 0.5638187527656555, "learning_rate": 4.252865374898726e-06, "loss": 0.8225, "step": 2782 }, { "epoch": 0.5614300529240623, "grad_norm": 0.39516234397888184, "learning_rate": 4.249634495291004e-06, "loss": 0.6774, "step": 2783 }, { "epoch": 0.5616317884802693, "grad_norm": 0.33776554465293884, "learning_rate": 4.24640393618056e-06, "loss": 0.6498, "step": 2784 }, { "epoch": 0.5618335240364764, "grad_norm": 0.43080976605415344, "learning_rate": 4.243173698947238e-06, "loss": 0.6616, "step": 2785 }, { "epoch": 0.5620352595926833, "grad_norm": 0.36215585470199585, "learning_rate": 4.239943784970738e-06, "loss": 0.6085, "step": 2786 }, { "epoch": 0.5622369951488903, "grad_norm": 0.40813156962394714, "learning_rate": 4.236714195630627e-06, "loss": 0.7525, "step": 2787 }, { "epoch": 0.5624387307050973, "grad_norm": 0.5586144924163818, "learning_rate": 4.233484932306337e-06, "loss": 0.8477, "step": 2788 }, { "epoch": 0.5626404662613043, "grad_norm": 0.32251301407814026, "learning_rate": 4.23025599637715e-06, "loss": 0.7107, "step": 2789 }, { "epoch": 0.5628422018175113, "grad_norm": 0.29961180686950684, "learning_rate": 4.227027389222215e-06, "loss": 0.9627, "step": 2790 }, { "epoch": 0.5630439373737183, "grad_norm": 0.7028844952583313, "learning_rate": 4.223799112220543e-06, "loss": 0.6415, "step": 2791 }, { "epoch": 0.5632456729299252, "grad_norm": 0.3743568956851959, "learning_rate": 4.2205711667509986e-06, "loss": 0.6619, "step": 2792 }, { "epoch": 0.5634474084861323, "grad_norm": 0.348414808511734, "learning_rate": 4.217343554192308e-06, "loss": 0.6367, "step": 2793 }, { "epoch": 0.5636491440423392, "grad_norm": 0.42004287242889404, "learning_rate": 4.214116275923051e-06, "loss": 0.6912, "step": 2794 }, { "epoch": 0.5638508795985463, "grad_norm": 0.450578510761261, "learning_rate": 4.210889333321668e-06, "loss": 0.7102, "step": 2795 }, { "epoch": 0.5640526151547532, "grad_norm": 0.36887484788894653, "learning_rate": 4.207662727766462e-06, "loss": 0.7953, "step": 2796 }, { "epoch": 0.5642543507109602, "grad_norm": 0.47906526923179626, "learning_rate": 4.204436460635578e-06, "loss": 0.6876, "step": 2797 }, { "epoch": 0.5644560862671673, "grad_norm": 0.3474639654159546, "learning_rate": 4.201210533307028e-06, "loss": 0.6823, "step": 2798 }, { "epoch": 0.5646578218233742, "grad_norm": 1.1059390306472778, "learning_rate": 4.1979849471586755e-06, "loss": 0.6545, "step": 2799 }, { "epoch": 0.5648595573795812, "grad_norm": 0.47281643748283386, "learning_rate": 4.1947597035682355e-06, "loss": 0.6701, "step": 2800 }, { "epoch": 0.5650612929357882, "grad_norm": 0.4572257995605469, "learning_rate": 4.191534803913281e-06, "loss": 0.6721, "step": 2801 }, { "epoch": 0.5652630284919952, "grad_norm": 0.39056044816970825, "learning_rate": 4.188310249571236e-06, "loss": 0.6467, "step": 2802 }, { "epoch": 0.5654647640482022, "grad_norm": 0.600731611251831, "learning_rate": 4.185086041919376e-06, "loss": 0.8154, "step": 2803 }, { "epoch": 0.5656664996044092, "grad_norm": 0.3447350561618805, "learning_rate": 4.18186218233483e-06, "loss": 0.7342, "step": 2804 }, { "epoch": 0.5658682351606161, "grad_norm": 1.207696795463562, "learning_rate": 4.178638672194582e-06, "loss": 0.6334, "step": 2805 }, { "epoch": 0.5660699707168232, "grad_norm": 0.6632784605026245, "learning_rate": 4.1754155128754545e-06, "loss": 0.6567, "step": 2806 }, { "epoch": 0.5662717062730301, "grad_norm": 0.4278070032596588, "learning_rate": 4.172192705754135e-06, "loss": 0.617, "step": 2807 }, { "epoch": 0.5664734418292372, "grad_norm": 0.9952390193939209, "learning_rate": 4.168970252207151e-06, "loss": 0.6974, "step": 2808 }, { "epoch": 0.5666751773854442, "grad_norm": 0.358598917722702, "learning_rate": 4.165748153610881e-06, "loss": 0.7966, "step": 2809 }, { "epoch": 0.5668769129416511, "grad_norm": 0.46342357993125916, "learning_rate": 4.1625264113415564e-06, "loss": 0.6444, "step": 2810 }, { "epoch": 0.5670786484978582, "grad_norm": 0.34472960233688354, "learning_rate": 4.159305026775249e-06, "loss": 0.664, "step": 2811 }, { "epoch": 0.5672803840540651, "grad_norm": 0.4725569188594818, "learning_rate": 4.156084001287883e-06, "loss": 0.6403, "step": 2812 }, { "epoch": 0.5674821196102722, "grad_norm": 0.34548354148864746, "learning_rate": 4.152863336255231e-06, "loss": 0.6669, "step": 2813 }, { "epoch": 0.5676838551664791, "grad_norm": 0.6782296299934387, "learning_rate": 4.149643033052902e-06, "loss": 0.7029, "step": 2814 }, { "epoch": 0.5678855907226861, "grad_norm": 0.38759613037109375, "learning_rate": 4.1464230930563595e-06, "loss": 0.6613, "step": 2815 }, { "epoch": 0.5680873262788931, "grad_norm": 0.37541335821151733, "learning_rate": 4.143203517640914e-06, "loss": 0.6692, "step": 2816 }, { "epoch": 0.5682890618351001, "grad_norm": 0.33899807929992676, "learning_rate": 4.1399843081817085e-06, "loss": 0.6809, "step": 2817 }, { "epoch": 0.568490797391307, "grad_norm": 0.36863020062446594, "learning_rate": 4.136765466053741e-06, "loss": 0.7819, "step": 2818 }, { "epoch": 0.5686925329475141, "grad_norm": 0.5838175415992737, "learning_rate": 4.133546992631847e-06, "loss": 0.6702, "step": 2819 }, { "epoch": 0.568894268503721, "grad_norm": 1.1761834621429443, "learning_rate": 4.130328889290705e-06, "loss": 0.8095, "step": 2820 }, { "epoch": 0.5690960040599281, "grad_norm": 0.6819433569908142, "learning_rate": 4.127111157404841e-06, "loss": 0.7284, "step": 2821 }, { "epoch": 0.5692977396161351, "grad_norm": 0.51947021484375, "learning_rate": 4.1238937983486085e-06, "loss": 0.6761, "step": 2822 }, { "epoch": 0.569499475172342, "grad_norm": 0.40705233812332153, "learning_rate": 4.120676813496219e-06, "loss": 0.7013, "step": 2823 }, { "epoch": 0.5697012107285491, "grad_norm": 0.3811868727207184, "learning_rate": 4.117460204221715e-06, "loss": 0.7444, "step": 2824 }, { "epoch": 0.569902946284756, "grad_norm": 0.6068965196609497, "learning_rate": 4.114243971898976e-06, "loss": 0.6504, "step": 2825 }, { "epoch": 0.5701046818409631, "grad_norm": 0.460427463054657, "learning_rate": 4.111028117901726e-06, "loss": 0.6779, "step": 2826 }, { "epoch": 0.57030641739717, "grad_norm": 0.31617289781570435, "learning_rate": 4.107812643603528e-06, "loss": 0.6728, "step": 2827 }, { "epoch": 0.570508152953377, "grad_norm": 0.48123976588249207, "learning_rate": 4.104597550377776e-06, "loss": 0.6909, "step": 2828 }, { "epoch": 0.570709888509584, "grad_norm": 0.3782070577144623, "learning_rate": 4.1013828395977075e-06, "loss": 0.6668, "step": 2829 }, { "epoch": 0.570911624065791, "grad_norm": 0.7894898653030396, "learning_rate": 4.098168512636397e-06, "loss": 0.8079, "step": 2830 }, { "epoch": 0.571113359621998, "grad_norm": 1.8633605241775513, "learning_rate": 4.094954570866748e-06, "loss": 0.6558, "step": 2831 }, { "epoch": 0.571315095178205, "grad_norm": 0.42616137862205505, "learning_rate": 4.0917410156615085e-06, "loss": 0.7138, "step": 2832 }, { "epoch": 0.571516830734412, "grad_norm": 0.5962389707565308, "learning_rate": 4.088527848393258e-06, "loss": 0.6977, "step": 2833 }, { "epoch": 0.571718566290619, "grad_norm": 0.4381890892982483, "learning_rate": 4.085315070434405e-06, "loss": 0.6546, "step": 2834 }, { "epoch": 0.571920301846826, "grad_norm": 0.4706403613090515, "learning_rate": 4.0821026831572e-06, "loss": 0.6852, "step": 2835 }, { "epoch": 0.5721220374030329, "grad_norm": 0.7240921854972839, "learning_rate": 4.078890687933719e-06, "loss": 0.6789, "step": 2836 }, { "epoch": 0.57232377295924, "grad_norm": 0.8354261517524719, "learning_rate": 4.075679086135877e-06, "loss": 0.6875, "step": 2837 }, { "epoch": 0.5725255085154469, "grad_norm": 0.6532930731773376, "learning_rate": 4.07246787913542e-06, "loss": 0.699, "step": 2838 }, { "epoch": 0.572727244071654, "grad_norm": 0.45959559082984924, "learning_rate": 4.06925706830392e-06, "loss": 0.6808, "step": 2839 }, { "epoch": 0.5729289796278609, "grad_norm": 0.461574912071228, "learning_rate": 4.066046655012786e-06, "loss": 0.6926, "step": 2840 }, { "epoch": 0.5731307151840679, "grad_norm": 0.33318328857421875, "learning_rate": 4.062836640633256e-06, "loss": 0.7878, "step": 2841 }, { "epoch": 0.573332450740275, "grad_norm": 0.4128797948360443, "learning_rate": 4.05962702653639e-06, "loss": 0.6467, "step": 2842 }, { "epoch": 0.5735341862964819, "grad_norm": 0.41282278299331665, "learning_rate": 4.056417814093089e-06, "loss": 0.6423, "step": 2843 }, { "epoch": 0.573735921852689, "grad_norm": 0.38421866297721863, "learning_rate": 4.053209004674079e-06, "loss": 0.6681, "step": 2844 }, { "epoch": 0.5739376574088959, "grad_norm": 0.4734240174293518, "learning_rate": 4.050000599649905e-06, "loss": 0.7335, "step": 2845 }, { "epoch": 0.5741393929651029, "grad_norm": 0.8877872228622437, "learning_rate": 4.046792600390948e-06, "loss": 0.7402, "step": 2846 }, { "epoch": 0.5743411285213099, "grad_norm": 0.407859742641449, "learning_rate": 4.043585008267418e-06, "loss": 0.856, "step": 2847 }, { "epoch": 0.5745428640775169, "grad_norm": 0.36701953411102295, "learning_rate": 4.040377824649341e-06, "loss": 0.6531, "step": 2848 }, { "epoch": 0.5747445996337239, "grad_norm": 0.32425543665885925, "learning_rate": 4.0371710509065775e-06, "loss": 0.7612, "step": 2849 }, { "epoch": 0.5749463351899309, "grad_norm": 0.5345488786697388, "learning_rate": 4.033964688408808e-06, "loss": 0.67, "step": 2850 }, { "epoch": 0.5751480707461378, "grad_norm": 0.37572526931762695, "learning_rate": 4.0307587385255395e-06, "loss": 0.6508, "step": 2851 }, { "epoch": 0.5753498063023449, "grad_norm": 0.5175938010215759, "learning_rate": 4.027553202626105e-06, "loss": 0.7204, "step": 2852 }, { "epoch": 0.5755515418585518, "grad_norm": 0.38628992438316345, "learning_rate": 4.0243480820796544e-06, "loss": 0.6626, "step": 2853 }, { "epoch": 0.5757532774147589, "grad_norm": 0.39906197786331177, "learning_rate": 4.021143378255164e-06, "loss": 0.8014, "step": 2854 }, { "epoch": 0.5759550129709659, "grad_norm": 0.5469081997871399, "learning_rate": 4.017939092521434e-06, "loss": 0.646, "step": 2855 }, { "epoch": 0.5761567485271728, "grad_norm": 0.4166348874568939, "learning_rate": 4.014735226247082e-06, "loss": 0.7024, "step": 2856 }, { "epoch": 0.5763584840833799, "grad_norm": 0.42324933409690857, "learning_rate": 4.011531780800549e-06, "loss": 0.6633, "step": 2857 }, { "epoch": 0.5765602196395868, "grad_norm": 0.3424137532711029, "learning_rate": 4.0083287575500965e-06, "loss": 0.636, "step": 2858 }, { "epoch": 0.5767619551957938, "grad_norm": 0.6285327076911926, "learning_rate": 4.005126157863803e-06, "loss": 0.8236, "step": 2859 }, { "epoch": 0.5769636907520008, "grad_norm": 0.5726880431175232, "learning_rate": 4.001923983109569e-06, "loss": 0.6326, "step": 2860 }, { "epoch": 0.5771654263082078, "grad_norm": 0.7701642513275146, "learning_rate": 3.998722234655113e-06, "loss": 0.6819, "step": 2861 }, { "epoch": 0.5773671618644148, "grad_norm": 1.3106602430343628, "learning_rate": 3.995520913867968e-06, "loss": 0.7423, "step": 2862 }, { "epoch": 0.5775688974206218, "grad_norm": 0.4269159138202667, "learning_rate": 3.992320022115492e-06, "loss": 0.646, "step": 2863 }, { "epoch": 0.5777706329768287, "grad_norm": 0.75742107629776, "learning_rate": 3.989119560764849e-06, "loss": 0.7595, "step": 2864 }, { "epoch": 0.5779723685330358, "grad_norm": 0.8314652442932129, "learning_rate": 3.985919531183029e-06, "loss": 0.6935, "step": 2865 }, { "epoch": 0.5781741040892427, "grad_norm": 0.7508164048194885, "learning_rate": 3.982719934736832e-06, "loss": 0.6716, "step": 2866 }, { "epoch": 0.5783758396454498, "grad_norm": 1.0965166091918945, "learning_rate": 3.979520772792875e-06, "loss": 0.676, "step": 2867 }, { "epoch": 0.5785775752016568, "grad_norm": 1.353140115737915, "learning_rate": 3.976322046717589e-06, "loss": 0.7143, "step": 2868 }, { "epoch": 0.5787793107578637, "grad_norm": 0.7627206444740295, "learning_rate": 3.973123757877219e-06, "loss": 0.8499, "step": 2869 }, { "epoch": 0.5789810463140708, "grad_norm": 0.46488526463508606, "learning_rate": 3.969925907637823e-06, "loss": 0.9204, "step": 2870 }, { "epoch": 0.5791827818702777, "grad_norm": 0.43276649713516235, "learning_rate": 3.966728497365272e-06, "loss": 0.7419, "step": 2871 }, { "epoch": 0.5793845174264848, "grad_norm": 0.3913387656211853, "learning_rate": 3.96353152842525e-06, "loss": 0.7284, "step": 2872 }, { "epoch": 0.5795862529826917, "grad_norm": 0.3979531526565552, "learning_rate": 3.9603350021832485e-06, "loss": 0.6834, "step": 2873 }, { "epoch": 0.5797879885388987, "grad_norm": 0.4591613709926605, "learning_rate": 3.9571389200045735e-06, "loss": 0.6881, "step": 2874 }, { "epoch": 0.5799897240951057, "grad_norm": 0.685820996761322, "learning_rate": 3.953943283254342e-06, "loss": 0.7704, "step": 2875 }, { "epoch": 0.5801914596513127, "grad_norm": 1.0198726654052734, "learning_rate": 3.950748093297479e-06, "loss": 0.7098, "step": 2876 }, { "epoch": 0.5803931952075196, "grad_norm": 0.4752916693687439, "learning_rate": 3.947553351498719e-06, "loss": 0.6746, "step": 2877 }, { "epoch": 0.5805949307637267, "grad_norm": 0.8071935176849365, "learning_rate": 3.9443590592226025e-06, "loss": 0.6935, "step": 2878 }, { "epoch": 0.5807966663199337, "grad_norm": 0.6545006036758423, "learning_rate": 3.941165217833484e-06, "loss": 0.65, "step": 2879 }, { "epoch": 0.5809984018761407, "grad_norm": 0.5821269750595093, "learning_rate": 3.937971828695522e-06, "loss": 0.8889, "step": 2880 }, { "epoch": 0.5812001374323477, "grad_norm": 0.6591540575027466, "learning_rate": 3.934778893172679e-06, "loss": 0.6388, "step": 2881 }, { "epoch": 0.5814018729885546, "grad_norm": 0.9008996486663818, "learning_rate": 3.931586412628727e-06, "loss": 0.6927, "step": 2882 }, { "epoch": 0.5816036085447617, "grad_norm": 0.7184357643127441, "learning_rate": 3.928394388427247e-06, "loss": 0.681, "step": 2883 }, { "epoch": 0.5818053441009686, "grad_norm": 0.41039785742759705, "learning_rate": 3.925202821931618e-06, "loss": 0.885, "step": 2884 }, { "epoch": 0.5820070796571757, "grad_norm": 0.9089972972869873, "learning_rate": 3.9220117145050254e-06, "loss": 0.685, "step": 2885 }, { "epoch": 0.5822088152133826, "grad_norm": 0.5806027054786682, "learning_rate": 3.918821067510464e-06, "loss": 0.6892, "step": 2886 }, { "epoch": 0.5824105507695896, "grad_norm": 1.8731859922409058, "learning_rate": 3.915630882310726e-06, "loss": 0.8295, "step": 2887 }, { "epoch": 0.5826122863257966, "grad_norm": 0.8789365887641907, "learning_rate": 3.912441160268407e-06, "loss": 0.6816, "step": 2888 }, { "epoch": 0.5828140218820036, "grad_norm": 0.4402933716773987, "learning_rate": 3.909251902745909e-06, "loss": 0.8881, "step": 2889 }, { "epoch": 0.5830157574382107, "grad_norm": 0.4205271005630493, "learning_rate": 3.90606311110543e-06, "loss": 0.6722, "step": 2890 }, { "epoch": 0.5832174929944176, "grad_norm": 0.39811578392982483, "learning_rate": 3.9028747867089735e-06, "loss": 0.7116, "step": 2891 }, { "epoch": 0.5834192285506246, "grad_norm": 0.36638522148132324, "learning_rate": 3.899686930918339e-06, "loss": 0.6378, "step": 2892 }, { "epoch": 0.5836209641068316, "grad_norm": 0.5602337718009949, "learning_rate": 3.89649954509513e-06, "loss": 0.6696, "step": 2893 }, { "epoch": 0.5838226996630386, "grad_norm": 0.37899842858314514, "learning_rate": 3.893312630600749e-06, "loss": 0.7303, "step": 2894 }, { "epoch": 0.5840244352192455, "grad_norm": 0.8907220363616943, "learning_rate": 3.890126188796393e-06, "loss": 0.6649, "step": 2895 }, { "epoch": 0.5842261707754526, "grad_norm": 1.020248532295227, "learning_rate": 3.8869402210430616e-06, "loss": 0.679, "step": 2896 }, { "epoch": 0.5844279063316595, "grad_norm": 0.6201417446136475, "learning_rate": 3.883754728701552e-06, "loss": 0.656, "step": 2897 }, { "epoch": 0.5846296418878666, "grad_norm": 0.3928530514240265, "learning_rate": 3.8805697131324525e-06, "loss": 0.9927, "step": 2898 }, { "epoch": 0.5848313774440735, "grad_norm": 0.3769809603691101, "learning_rate": 3.877385175696156e-06, "loss": 0.7605, "step": 2899 }, { "epoch": 0.5850331130002805, "grad_norm": 0.6660411953926086, "learning_rate": 3.874201117752846e-06, "loss": 0.6641, "step": 2900 }, { "epoch": 0.5852348485564876, "grad_norm": 0.41550612449645996, "learning_rate": 3.8710175406625e-06, "loss": 0.7132, "step": 2901 }, { "epoch": 0.5854365841126945, "grad_norm": 0.7428571581840515, "learning_rate": 3.867834445784893e-06, "loss": 0.6619, "step": 2902 }, { "epoch": 0.5856383196689016, "grad_norm": 0.9024825096130371, "learning_rate": 3.864651834479596e-06, "loss": 0.6359, "step": 2903 }, { "epoch": 0.5858400552251085, "grad_norm": 0.6011614799499512, "learning_rate": 3.861469708105969e-06, "loss": 0.6514, "step": 2904 }, { "epoch": 0.5860417907813155, "grad_norm": 0.4725446403026581, "learning_rate": 3.8582880680231675e-06, "loss": 0.7775, "step": 2905 }, { "epoch": 0.5862435263375225, "grad_norm": 0.5164721012115479, "learning_rate": 3.855106915590137e-06, "loss": 0.6267, "step": 2906 }, { "epoch": 0.5864452618937295, "grad_norm": 0.3842906951904297, "learning_rate": 3.851926252165616e-06, "loss": 0.6418, "step": 2907 }, { "epoch": 0.5866469974499365, "grad_norm": 0.5270214676856995, "learning_rate": 3.848746079108139e-06, "loss": 0.7971, "step": 2908 }, { "epoch": 0.5868487330061435, "grad_norm": 0.32814523577690125, "learning_rate": 3.845566397776022e-06, "loss": 0.7647, "step": 2909 }, { "epoch": 0.5870504685623504, "grad_norm": 1.0125082731246948, "learning_rate": 3.842387209527374e-06, "loss": 0.7725, "step": 2910 }, { "epoch": 0.5872522041185575, "grad_norm": 0.4370710253715515, "learning_rate": 3.839208515720102e-06, "loss": 0.6698, "step": 2911 }, { "epoch": 0.5874539396747644, "grad_norm": 0.3766273558139801, "learning_rate": 3.836030317711886e-06, "loss": 0.628, "step": 2912 }, { "epoch": 0.5876556752309714, "grad_norm": 0.5694656372070312, "learning_rate": 3.832852616860208e-06, "loss": 0.6578, "step": 2913 }, { "epoch": 0.5878574107871785, "grad_norm": 0.5162098407745361, "learning_rate": 3.829675414522332e-06, "loss": 0.691, "step": 2914 }, { "epoch": 0.5880591463433854, "grad_norm": 1.029689073562622, "learning_rate": 3.82649871205531e-06, "loss": 0.7757, "step": 2915 }, { "epoch": 0.5882608818995925, "grad_norm": 1.1007672548294067, "learning_rate": 3.8233225108159765e-06, "loss": 0.7941, "step": 2916 }, { "epoch": 0.5884626174557994, "grad_norm": 0.3063526451587677, "learning_rate": 3.82014681216096e-06, "loss": 0.632, "step": 2917 }, { "epoch": 0.5886643530120064, "grad_norm": 0.2978290319442749, "learning_rate": 3.8169716174466675e-06, "loss": 0.6563, "step": 2918 }, { "epoch": 0.5888660885682134, "grad_norm": 0.4835878312587738, "learning_rate": 3.813796928029295e-06, "loss": 0.6915, "step": 2919 }, { "epoch": 0.5890678241244204, "grad_norm": 0.3862401247024536, "learning_rate": 3.8106227452648175e-06, "loss": 0.6784, "step": 2920 }, { "epoch": 0.5892695596806274, "grad_norm": 0.44164806604385376, "learning_rate": 3.8074490705089983e-06, "loss": 0.6522, "step": 2921 }, { "epoch": 0.5894712952368344, "grad_norm": 0.5748169422149658, "learning_rate": 3.8042759051173843e-06, "loss": 0.6835, "step": 2922 }, { "epoch": 0.5896730307930413, "grad_norm": 0.5566649436950684, "learning_rate": 3.8011032504453e-06, "loss": 0.666, "step": 2923 }, { "epoch": 0.5898747663492484, "grad_norm": 0.5561574697494507, "learning_rate": 3.7979311078478554e-06, "loss": 0.7103, "step": 2924 }, { "epoch": 0.5900765019054554, "grad_norm": 0.4142785966396332, "learning_rate": 3.7947594786799424e-06, "loss": 0.8713, "step": 2925 }, { "epoch": 0.5902782374616624, "grad_norm": 1.048613429069519, "learning_rate": 3.7915883642962303e-06, "loss": 0.6412, "step": 2926 }, { "epoch": 0.5904799730178694, "grad_norm": 0.685620129108429, "learning_rate": 3.7884177660511713e-06, "loss": 0.6561, "step": 2927 }, { "epoch": 0.5906817085740763, "grad_norm": 0.43489912152290344, "learning_rate": 3.785247685298998e-06, "loss": 0.6759, "step": 2928 }, { "epoch": 0.5908834441302834, "grad_norm": 0.32432207465171814, "learning_rate": 3.7820781233937163e-06, "loss": 0.6417, "step": 2929 }, { "epoch": 0.5910851796864903, "grad_norm": 0.9676985144615173, "learning_rate": 3.7789090816891157e-06, "loss": 0.6837, "step": 2930 }, { "epoch": 0.5912869152426973, "grad_norm": 0.40368932485580444, "learning_rate": 3.7757405615387657e-06, "loss": 0.6609, "step": 2931 }, { "epoch": 0.5914886507989043, "grad_norm": 0.6651648879051208, "learning_rate": 3.7725725642960047e-06, "loss": 0.7401, "step": 2932 }, { "epoch": 0.5916903863551113, "grad_norm": 0.35699695348739624, "learning_rate": 3.7694050913139555e-06, "loss": 0.6083, "step": 2933 }, { "epoch": 0.5918921219113183, "grad_norm": 0.5533597469329834, "learning_rate": 3.7662381439455133e-06, "loss": 0.6542, "step": 2934 }, { "epoch": 0.5920938574675253, "grad_norm": 0.7516672611236572, "learning_rate": 3.763071723543349e-06, "loss": 0.8206, "step": 2935 }, { "epoch": 0.5922955930237322, "grad_norm": 0.5364474654197693, "learning_rate": 3.7599058314599112e-06, "loss": 0.6639, "step": 2936 }, { "epoch": 0.5924973285799393, "grad_norm": 0.753676176071167, "learning_rate": 3.756740469047416e-06, "loss": 0.6909, "step": 2937 }, { "epoch": 0.5926990641361463, "grad_norm": 0.8867906332015991, "learning_rate": 3.7535756376578625e-06, "loss": 0.7263, "step": 2938 }, { "epoch": 0.5929007996923533, "grad_norm": 0.46517348289489746, "learning_rate": 3.7504113386430187e-06, "loss": 0.7492, "step": 2939 }, { "epoch": 0.5931025352485603, "grad_norm": 3.5035691261291504, "learning_rate": 3.747247573354421e-06, "loss": 0.6719, "step": 2940 }, { "epoch": 0.5933042708047672, "grad_norm": 0.43149906396865845, "learning_rate": 3.744084343143383e-06, "loss": 0.6979, "step": 2941 }, { "epoch": 0.5935060063609743, "grad_norm": 0.405324250459671, "learning_rate": 3.740921649360991e-06, "loss": 0.6662, "step": 2942 }, { "epoch": 0.5937077419171812, "grad_norm": 0.40969419479370117, "learning_rate": 3.7377594933580967e-06, "loss": 0.6335, "step": 2943 }, { "epoch": 0.5939094774733883, "grad_norm": 0.3887074291706085, "learning_rate": 3.7345978764853276e-06, "loss": 0.6206, "step": 2944 }, { "epoch": 0.5941112130295952, "grad_norm": 0.33942726254463196, "learning_rate": 3.7314368000930754e-06, "loss": 0.6788, "step": 2945 }, { "epoch": 0.5943129485858022, "grad_norm": 0.8974020481109619, "learning_rate": 3.7282762655315065e-06, "loss": 0.6798, "step": 2946 }, { "epoch": 0.5945146841420093, "grad_norm": 0.4469338655471802, "learning_rate": 3.7251162741505543e-06, "loss": 0.6575, "step": 2947 }, { "epoch": 0.5947164196982162, "grad_norm": 0.5734437704086304, "learning_rate": 3.7219568272999148e-06, "loss": 0.7013, "step": 2948 }, { "epoch": 0.5949181552544233, "grad_norm": 0.38166043162345886, "learning_rate": 3.7187979263290585e-06, "loss": 0.6895, "step": 2949 }, { "epoch": 0.5951198908106302, "grad_norm": 0.32561951875686646, "learning_rate": 3.7156395725872213e-06, "loss": 0.7999, "step": 2950 }, { "epoch": 0.5953216263668372, "grad_norm": 0.3810766637325287, "learning_rate": 3.712481767423402e-06, "loss": 0.7402, "step": 2951 }, { "epoch": 0.5955233619230442, "grad_norm": 0.3509414494037628, "learning_rate": 3.7093245121863673e-06, "loss": 0.6636, "step": 2952 }, { "epoch": 0.5957250974792512, "grad_norm": 0.5389799475669861, "learning_rate": 3.706167808224652e-06, "loss": 0.7185, "step": 2953 }, { "epoch": 0.5959268330354581, "grad_norm": 1.3201884031295776, "learning_rate": 3.7030116568865486e-06, "loss": 0.7929, "step": 2954 }, { "epoch": 0.5961285685916652, "grad_norm": 0.3962297737598419, "learning_rate": 3.6998560595201188e-06, "loss": 0.6616, "step": 2955 }, { "epoch": 0.5963303041478721, "grad_norm": 0.8047513961791992, "learning_rate": 3.696701017473189e-06, "loss": 0.6579, "step": 2956 }, { "epoch": 0.5965320397040792, "grad_norm": 0.3982381522655487, "learning_rate": 3.6935465320933393e-06, "loss": 0.6751, "step": 2957 }, { "epoch": 0.5967337752602861, "grad_norm": 0.3599437475204468, "learning_rate": 3.6903926047279254e-06, "loss": 0.6433, "step": 2958 }, { "epoch": 0.5969355108164931, "grad_norm": 0.8588778972625732, "learning_rate": 3.6872392367240523e-06, "loss": 0.8927, "step": 2959 }, { "epoch": 0.5971372463727002, "grad_norm": 0.38870593905448914, "learning_rate": 3.684086429428594e-06, "loss": 0.6456, "step": 2960 }, { "epoch": 0.5973389819289071, "grad_norm": 1.5577501058578491, "learning_rate": 3.680934184188182e-06, "loss": 0.7435, "step": 2961 }, { "epoch": 0.5975407174851142, "grad_norm": 1.114391803741455, "learning_rate": 3.6777825023492076e-06, "loss": 0.8232, "step": 2962 }, { "epoch": 0.5977424530413211, "grad_norm": 0.6805806756019592, "learning_rate": 3.6746313852578226e-06, "loss": 0.6773, "step": 2963 }, { "epoch": 0.5979441885975281, "grad_norm": 0.6296008229255676, "learning_rate": 3.671480834259939e-06, "loss": 0.6876, "step": 2964 }, { "epoch": 0.5981459241537351, "grad_norm": 1.0204540491104126, "learning_rate": 3.6683308507012196e-06, "loss": 0.6512, "step": 2965 }, { "epoch": 0.5983476597099421, "grad_norm": 0.6919869184494019, "learning_rate": 3.6651814359270955e-06, "loss": 0.6719, "step": 2966 }, { "epoch": 0.5985493952661491, "grad_norm": 1.0824942588806152, "learning_rate": 3.6620325912827493e-06, "loss": 0.848, "step": 2967 }, { "epoch": 0.5987511308223561, "grad_norm": 0.3988608419895172, "learning_rate": 3.658884318113117e-06, "loss": 0.6716, "step": 2968 }, { "epoch": 0.598952866378563, "grad_norm": 1.047755479812622, "learning_rate": 3.6557366177628956e-06, "loss": 0.6931, "step": 2969 }, { "epoch": 0.5991546019347701, "grad_norm": 0.7173445224761963, "learning_rate": 3.652589491576537e-06, "loss": 0.6335, "step": 2970 }, { "epoch": 0.599356337490977, "grad_norm": 1.1091235876083374, "learning_rate": 3.6494429408982446e-06, "loss": 0.622, "step": 2971 }, { "epoch": 0.599558073047184, "grad_norm": 0.8816885352134705, "learning_rate": 3.6462969670719807e-06, "loss": 0.6661, "step": 2972 }, { "epoch": 0.5997598086033911, "grad_norm": 1.5705469846725464, "learning_rate": 3.6431515714414552e-06, "loss": 0.6991, "step": 2973 }, { "epoch": 0.599961544159598, "grad_norm": 0.774057924747467, "learning_rate": 3.6400067553501362e-06, "loss": 0.6694, "step": 2974 }, { "epoch": 0.6001632797158051, "grad_norm": 0.43345385789871216, "learning_rate": 3.6368625201412443e-06, "loss": 0.8072, "step": 2975 }, { "epoch": 0.600365015272012, "grad_norm": 0.3652766942977905, "learning_rate": 3.6337188671577463e-06, "loss": 0.6596, "step": 2976 }, { "epoch": 0.600566750828219, "grad_norm": 0.3644733428955078, "learning_rate": 3.630575797742365e-06, "loss": 0.6502, "step": 2977 }, { "epoch": 0.600768486384426, "grad_norm": 0.8941843509674072, "learning_rate": 3.627433313237576e-06, "loss": 0.6439, "step": 2978 }, { "epoch": 0.600970221940633, "grad_norm": 0.9603281021118164, "learning_rate": 3.6242914149855984e-06, "loss": 0.6655, "step": 2979 }, { "epoch": 0.60117195749684, "grad_norm": 0.37791603803634644, "learning_rate": 3.621150104328407e-06, "loss": 0.7303, "step": 2980 }, { "epoch": 0.601373693053047, "grad_norm": 0.862527072429657, "learning_rate": 3.6180093826077236e-06, "loss": 0.6355, "step": 2981 }, { "epoch": 0.601575428609254, "grad_norm": 0.4360196888446808, "learning_rate": 3.614869251165015e-06, "loss": 0.6691, "step": 2982 }, { "epoch": 0.601777164165461, "grad_norm": 0.38961103558540344, "learning_rate": 3.611729711341503e-06, "loss": 0.6645, "step": 2983 }, { "epoch": 0.601978899721668, "grad_norm": 0.6375663876533508, "learning_rate": 3.6085907644781522e-06, "loss": 0.6759, "step": 2984 }, { "epoch": 0.602180635277875, "grad_norm": 0.6156184673309326, "learning_rate": 3.6054524119156696e-06, "loss": 0.6485, "step": 2985 }, { "epoch": 0.602382370834082, "grad_norm": 0.4622036814689636, "learning_rate": 3.602314654994521e-06, "loss": 0.6766, "step": 2986 }, { "epoch": 0.6025841063902889, "grad_norm": 0.43395689129829407, "learning_rate": 3.599177495054903e-06, "loss": 0.7704, "step": 2987 }, { "epoch": 0.602785841946496, "grad_norm": 0.3791487216949463, "learning_rate": 3.5960409334367676e-06, "loss": 0.6537, "step": 2988 }, { "epoch": 0.6029875775027029, "grad_norm": 0.4992687702178955, "learning_rate": 3.592904971479808e-06, "loss": 0.6854, "step": 2989 }, { "epoch": 0.6031893130589099, "grad_norm": 0.3665289878845215, "learning_rate": 3.589769610523459e-06, "loss": 0.6498, "step": 2990 }, { "epoch": 0.6033910486151169, "grad_norm": 0.6164460778236389, "learning_rate": 3.5866348519069034e-06, "loss": 0.6605, "step": 2991 }, { "epoch": 0.6035927841713239, "grad_norm": 0.39422157406806946, "learning_rate": 3.5835006969690634e-06, "loss": 0.6493, "step": 2992 }, { "epoch": 0.603794519727531, "grad_norm": 0.6252549886703491, "learning_rate": 3.5803671470486023e-06, "loss": 0.6322, "step": 2993 }, { "epoch": 0.6039962552837379, "grad_norm": 1.0483990907669067, "learning_rate": 3.5772342034839293e-06, "loss": 0.6724, "step": 2994 }, { "epoch": 0.6041979908399449, "grad_norm": 1.0793009996414185, "learning_rate": 3.574101867613192e-06, "loss": 0.6607, "step": 2995 }, { "epoch": 0.6043997263961519, "grad_norm": 0.4432700276374817, "learning_rate": 3.570970140774277e-06, "loss": 0.6635, "step": 2996 }, { "epoch": 0.6046014619523589, "grad_norm": 0.41216427087783813, "learning_rate": 3.567839024304812e-06, "loss": 0.616, "step": 2997 }, { "epoch": 0.6048031975085659, "grad_norm": 0.9142566323280334, "learning_rate": 3.5647085195421668e-06, "loss": 0.6757, "step": 2998 }, { "epoch": 0.6050049330647729, "grad_norm": 0.3180692791938782, "learning_rate": 3.5615786278234443e-06, "loss": 0.785, "step": 2999 }, { "epoch": 0.6052066686209798, "grad_norm": 2.0407209396362305, "learning_rate": 3.5584493504854924e-06, "loss": 0.6865, "step": 3000 }, { "epoch": 0.6054084041771869, "grad_norm": 0.601322591304779, "learning_rate": 3.555320688864889e-06, "loss": 0.7127, "step": 3001 }, { "epoch": 0.6056101397333938, "grad_norm": 0.4150068163871765, "learning_rate": 3.552192644297955e-06, "loss": 0.7162, "step": 3002 }, { "epoch": 0.6058118752896009, "grad_norm": 1.607946515083313, "learning_rate": 3.5490652181207474e-06, "loss": 0.6517, "step": 3003 }, { "epoch": 0.6060136108458078, "grad_norm": 3.0131125450134277, "learning_rate": 3.545938411669053e-06, "loss": 0.8067, "step": 3004 }, { "epoch": 0.6062153464020148, "grad_norm": 0.8337088227272034, "learning_rate": 3.5428122262784005e-06, "loss": 0.7968, "step": 3005 }, { "epoch": 0.6064170819582219, "grad_norm": 1.514702320098877, "learning_rate": 3.539686663284053e-06, "loss": 0.6373, "step": 3006 }, { "epoch": 0.6066188175144288, "grad_norm": 1.8276013135910034, "learning_rate": 3.536561724021003e-06, "loss": 0.7206, "step": 3007 }, { "epoch": 0.6068205530706358, "grad_norm": 0.4989999532699585, "learning_rate": 3.5334374098239797e-06, "loss": 0.6142, "step": 3008 }, { "epoch": 0.6070222886268428, "grad_norm": 0.3233503997325897, "learning_rate": 3.5303137220274467e-06, "loss": 0.6769, "step": 3009 }, { "epoch": 0.6072240241830498, "grad_norm": 0.4093765616416931, "learning_rate": 3.5271906619655966e-06, "loss": 0.6632, "step": 3010 }, { "epoch": 0.6074257597392568, "grad_norm": 0.8114385604858398, "learning_rate": 3.524068230972356e-06, "loss": 0.6268, "step": 3011 }, { "epoch": 0.6076274952954638, "grad_norm": 0.4605998396873474, "learning_rate": 3.5209464303813843e-06, "loss": 0.684, "step": 3012 }, { "epoch": 0.6078292308516707, "grad_norm": 0.4391426742076874, "learning_rate": 3.5178252615260677e-06, "loss": 0.681, "step": 3013 }, { "epoch": 0.6080309664078778, "grad_norm": 0.3784283697605133, "learning_rate": 3.5147047257395268e-06, "loss": 0.8089, "step": 3014 }, { "epoch": 0.6082327019640847, "grad_norm": 0.5586382746696472, "learning_rate": 3.5115848243546065e-06, "loss": 0.6608, "step": 3015 }, { "epoch": 0.6084344375202918, "grad_norm": 0.5277711749076843, "learning_rate": 3.508465558703885e-06, "loss": 0.7175, "step": 3016 }, { "epoch": 0.6086361730764988, "grad_norm": 0.420762836933136, "learning_rate": 3.505346930119671e-06, "loss": 0.6385, "step": 3017 }, { "epoch": 0.6088379086327057, "grad_norm": 0.3296513557434082, "learning_rate": 3.5022289399339933e-06, "loss": 0.7085, "step": 3018 }, { "epoch": 0.6090396441889128, "grad_norm": 0.41073331236839294, "learning_rate": 3.4991115894786152e-06, "loss": 0.737, "step": 3019 }, { "epoch": 0.6092413797451197, "grad_norm": 0.29459112882614136, "learning_rate": 3.4959948800850253e-06, "loss": 0.661, "step": 3020 }, { "epoch": 0.6094431153013268, "grad_norm": 0.49085837602615356, "learning_rate": 3.492878813084435e-06, "loss": 0.674, "step": 3021 }, { "epoch": 0.6096448508575337, "grad_norm": 0.427566796541214, "learning_rate": 3.489763389807784e-06, "loss": 0.6497, "step": 3022 }, { "epoch": 0.6098465864137407, "grad_norm": 0.8970413208007812, "learning_rate": 3.4866486115857407e-06, "loss": 0.707, "step": 3023 }, { "epoch": 0.6100483219699477, "grad_norm": 0.3847060799598694, "learning_rate": 3.483534479748688e-06, "loss": 0.6836, "step": 3024 }, { "epoch": 0.6102500575261547, "grad_norm": 1.063359022140503, "learning_rate": 3.480420995626741e-06, "loss": 0.6425, "step": 3025 }, { "epoch": 0.6104517930823616, "grad_norm": 0.6723124384880066, "learning_rate": 3.4773081605497393e-06, "loss": 0.7339, "step": 3026 }, { "epoch": 0.6106535286385687, "grad_norm": 0.41069871187210083, "learning_rate": 3.4741959758472367e-06, "loss": 0.6732, "step": 3027 }, { "epoch": 0.6108552641947756, "grad_norm": 0.608799934387207, "learning_rate": 3.4710844428485176e-06, "loss": 0.6761, "step": 3028 }, { "epoch": 0.6110569997509827, "grad_norm": 0.4474925398826599, "learning_rate": 3.4679735628825826e-06, "loss": 0.6193, "step": 3029 }, { "epoch": 0.6112587353071897, "grad_norm": 0.669126570224762, "learning_rate": 3.464863337278157e-06, "loss": 0.6864, "step": 3030 }, { "epoch": 0.6114604708633966, "grad_norm": 0.3941359519958496, "learning_rate": 3.461753767363687e-06, "loss": 0.6738, "step": 3031 }, { "epoch": 0.6116622064196037, "grad_norm": 0.45722004771232605, "learning_rate": 3.458644854467331e-06, "loss": 0.6721, "step": 3032 }, { "epoch": 0.6118639419758106, "grad_norm": 0.28656086325645447, "learning_rate": 3.455536599916979e-06, "loss": 0.6457, "step": 3033 }, { "epoch": 0.6120656775320177, "grad_norm": 0.7220064401626587, "learning_rate": 3.452429005040232e-06, "loss": 0.6899, "step": 3034 }, { "epoch": 0.6122674130882246, "grad_norm": 0.6459056735038757, "learning_rate": 3.449322071164408e-06, "loss": 0.664, "step": 3035 }, { "epoch": 0.6124691486444316, "grad_norm": 0.4235333502292633, "learning_rate": 3.446215799616548e-06, "loss": 0.7266, "step": 3036 }, { "epoch": 0.6126708842006386, "grad_norm": 0.3836830258369446, "learning_rate": 3.443110191723407e-06, "loss": 0.7827, "step": 3037 }, { "epoch": 0.6128726197568456, "grad_norm": 0.5448535084724426, "learning_rate": 3.440005248811457e-06, "loss": 0.7843, "step": 3038 }, { "epoch": 0.6130743553130527, "grad_norm": 0.44550132751464844, "learning_rate": 3.4369009722068846e-06, "loss": 0.6652, "step": 3039 }, { "epoch": 0.6132760908692596, "grad_norm": 0.4314553439617157, "learning_rate": 3.4337973632355958e-06, "loss": 0.6526, "step": 3040 }, { "epoch": 0.6134778264254666, "grad_norm": 0.5170155167579651, "learning_rate": 3.4306944232232065e-06, "loss": 0.6846, "step": 3041 }, { "epoch": 0.6136795619816736, "grad_norm": 0.6112193465232849, "learning_rate": 3.427592153495053e-06, "loss": 0.6449, "step": 3042 }, { "epoch": 0.6138812975378806, "grad_norm": 0.846665620803833, "learning_rate": 3.424490555376176e-06, "loss": 0.7245, "step": 3043 }, { "epoch": 0.6140830330940876, "grad_norm": 0.38036710023880005, "learning_rate": 3.421389630191338e-06, "loss": 0.7464, "step": 3044 }, { "epoch": 0.6142847686502946, "grad_norm": 0.4693972170352936, "learning_rate": 3.4182893792650117e-06, "loss": 0.7562, "step": 3045 }, { "epoch": 0.6144865042065015, "grad_norm": 1.0809375047683716, "learning_rate": 3.41518980392138e-06, "loss": 0.6967, "step": 3046 }, { "epoch": 0.6146882397627086, "grad_norm": 0.3993861973285675, "learning_rate": 3.4120909054843375e-06, "loss": 0.6257, "step": 3047 }, { "epoch": 0.6148899753189155, "grad_norm": 1.0259984731674194, "learning_rate": 3.4089926852774934e-06, "loss": 0.6815, "step": 3048 }, { "epoch": 0.6150917108751225, "grad_norm": 1.449548363685608, "learning_rate": 3.4058951446241604e-06, "loss": 0.6787, "step": 3049 }, { "epoch": 0.6152934464313295, "grad_norm": 0.5463994741439819, "learning_rate": 3.402798284847368e-06, "loss": 0.6876, "step": 3050 }, { "epoch": 0.6154951819875365, "grad_norm": 1.2420287132263184, "learning_rate": 3.3997021072698524e-06, "loss": 0.6332, "step": 3051 }, { "epoch": 0.6156969175437436, "grad_norm": 0.3920036554336548, "learning_rate": 3.396606613214053e-06, "loss": 0.8062, "step": 3052 }, { "epoch": 0.6158986530999505, "grad_norm": 0.4213649034500122, "learning_rate": 3.3935118040021255e-06, "loss": 0.6945, "step": 3053 }, { "epoch": 0.6161003886561575, "grad_norm": 1.0378633737564087, "learning_rate": 3.390417680955931e-06, "loss": 0.855, "step": 3054 }, { "epoch": 0.6163021242123645, "grad_norm": 0.718075692653656, "learning_rate": 3.387324245397032e-06, "loss": 0.8364, "step": 3055 }, { "epoch": 0.6165038597685715, "grad_norm": 0.8293585777282715, "learning_rate": 3.384231498646706e-06, "loss": 0.6713, "step": 3056 }, { "epoch": 0.6167055953247785, "grad_norm": 0.3795190155506134, "learning_rate": 3.381139442025928e-06, "loss": 0.8182, "step": 3057 }, { "epoch": 0.6169073308809855, "grad_norm": 0.36966368556022644, "learning_rate": 3.3780480768553834e-06, "loss": 0.6691, "step": 3058 }, { "epoch": 0.6171090664371924, "grad_norm": 0.6272523403167725, "learning_rate": 3.374957404455464e-06, "loss": 0.6929, "step": 3059 }, { "epoch": 0.6173108019933995, "grad_norm": 0.32975462079048157, "learning_rate": 3.371867426146256e-06, "loss": 0.7372, "step": 3060 }, { "epoch": 0.6175125375496064, "grad_norm": 0.42231565713882446, "learning_rate": 3.368778143247561e-06, "loss": 0.7876, "step": 3061 }, { "epoch": 0.6177142731058135, "grad_norm": 0.37638795375823975, "learning_rate": 3.3656895570788778e-06, "loss": 0.691, "step": 3062 }, { "epoch": 0.6179160086620205, "grad_norm": 0.4994615316390991, "learning_rate": 3.3626016689594053e-06, "loss": 0.7033, "step": 3063 }, { "epoch": 0.6181177442182274, "grad_norm": 0.4467025697231293, "learning_rate": 3.3595144802080493e-06, "loss": 0.6819, "step": 3064 }, { "epoch": 0.6183194797744345, "grad_norm": 0.3670813739299774, "learning_rate": 3.356427992143415e-06, "loss": 0.8562, "step": 3065 }, { "epoch": 0.6185212153306414, "grad_norm": 0.5851613879203796, "learning_rate": 3.3533422060838056e-06, "loss": 0.8069, "step": 3066 }, { "epoch": 0.6187229508868484, "grad_norm": 0.624320387840271, "learning_rate": 3.350257123347229e-06, "loss": 0.6733, "step": 3067 }, { "epoch": 0.6189246864430554, "grad_norm": 0.5927186608314514, "learning_rate": 3.34717274525139e-06, "loss": 0.6508, "step": 3068 }, { "epoch": 0.6191264219992624, "grad_norm": 1.0810381174087524, "learning_rate": 3.3440890731136925e-06, "loss": 0.7074, "step": 3069 }, { "epoch": 0.6193281575554694, "grad_norm": 0.3897158205509186, "learning_rate": 3.3410061082512422e-06, "loss": 0.7887, "step": 3070 }, { "epoch": 0.6195298931116764, "grad_norm": 0.9414277076721191, "learning_rate": 3.337923851980834e-06, "loss": 0.8225, "step": 3071 }, { "epoch": 0.6197316286678833, "grad_norm": 0.43071287870407104, "learning_rate": 3.3348423056189705e-06, "loss": 0.6808, "step": 3072 }, { "epoch": 0.6199333642240904, "grad_norm": 0.4197927713394165, "learning_rate": 3.331761470481846e-06, "loss": 0.9375, "step": 3073 }, { "epoch": 0.6201350997802973, "grad_norm": 0.4770767390727997, "learning_rate": 3.3286813478853495e-06, "loss": 0.6511, "step": 3074 }, { "epoch": 0.6203368353365044, "grad_norm": 0.47283339500427246, "learning_rate": 3.3256019391450696e-06, "loss": 0.6466, "step": 3075 }, { "epoch": 0.6205385708927114, "grad_norm": 0.42882150411605835, "learning_rate": 3.3225232455762885e-06, "loss": 0.74, "step": 3076 }, { "epoch": 0.6207403064489183, "grad_norm": 0.36606308817863464, "learning_rate": 3.319445268493981e-06, "loss": 0.6657, "step": 3077 }, { "epoch": 0.6209420420051254, "grad_norm": 0.5328860878944397, "learning_rate": 3.316368009212818e-06, "loss": 0.7469, "step": 3078 }, { "epoch": 0.6211437775613323, "grad_norm": 0.33745500445365906, "learning_rate": 3.3132914690471657e-06, "loss": 0.6671, "step": 3079 }, { "epoch": 0.6213455131175394, "grad_norm": 0.3988508880138397, "learning_rate": 3.310215649311075e-06, "loss": 0.7865, "step": 3080 }, { "epoch": 0.6215472486737463, "grad_norm": 0.601906955242157, "learning_rate": 3.3071405513182996e-06, "loss": 0.7031, "step": 3081 }, { "epoch": 0.6217489842299533, "grad_norm": 0.8292602896690369, "learning_rate": 3.304066176382281e-06, "loss": 0.6719, "step": 3082 }, { "epoch": 0.6219507197861603, "grad_norm": 0.38256916403770447, "learning_rate": 3.300992525816147e-06, "loss": 0.6376, "step": 3083 }, { "epoch": 0.6221524553423673, "grad_norm": 0.6803867220878601, "learning_rate": 3.297919600932723e-06, "loss": 0.797, "step": 3084 }, { "epoch": 0.6223541908985742, "grad_norm": 0.461038738489151, "learning_rate": 3.2948474030445187e-06, "loss": 0.6493, "step": 3085 }, { "epoch": 0.6225559264547813, "grad_norm": 0.3889191150665283, "learning_rate": 3.2917759334637376e-06, "loss": 0.6486, "step": 3086 }, { "epoch": 0.6227576620109883, "grad_norm": 0.3315489888191223, "learning_rate": 3.288705193502272e-06, "loss": 0.6665, "step": 3087 }, { "epoch": 0.6229593975671953, "grad_norm": 0.7866511940956116, "learning_rate": 3.2856351844716983e-06, "loss": 1.0142, "step": 3088 }, { "epoch": 0.6231611331234023, "grad_norm": 0.48360222578048706, "learning_rate": 3.2825659076832848e-06, "loss": 0.7912, "step": 3089 }, { "epoch": 0.6233628686796092, "grad_norm": 0.4154527187347412, "learning_rate": 3.2794973644479884e-06, "loss": 0.6703, "step": 3090 }, { "epoch": 0.6235646042358163, "grad_norm": 0.3557988703250885, "learning_rate": 3.276429556076445e-06, "loss": 0.7134, "step": 3091 }, { "epoch": 0.6237663397920232, "grad_norm": 0.8320627212524414, "learning_rate": 3.2733624838789846e-06, "loss": 0.6844, "step": 3092 }, { "epoch": 0.6239680753482303, "grad_norm": 0.9393786191940308, "learning_rate": 3.2702961491656197e-06, "loss": 0.6608, "step": 3093 }, { "epoch": 0.6241698109044372, "grad_norm": 0.876509428024292, "learning_rate": 3.267230553246047e-06, "loss": 0.6976, "step": 3094 }, { "epoch": 0.6243715464606442, "grad_norm": 0.36942392587661743, "learning_rate": 3.26416569742965e-06, "loss": 0.6621, "step": 3095 }, { "epoch": 0.6245732820168513, "grad_norm": 0.5300217270851135, "learning_rate": 3.261101583025494e-06, "loss": 0.7066, "step": 3096 }, { "epoch": 0.6247750175730582, "grad_norm": 1.0432391166687012, "learning_rate": 3.258038211342327e-06, "loss": 0.6486, "step": 3097 }, { "epoch": 0.6249767531292653, "grad_norm": 0.483392596244812, "learning_rate": 3.254975583688585e-06, "loss": 0.6736, "step": 3098 }, { "epoch": 0.6251784886854722, "grad_norm": 1.1535712480545044, "learning_rate": 3.2519137013723775e-06, "loss": 0.68, "step": 3099 }, { "epoch": 0.6253802242416792, "grad_norm": 0.8637328743934631, "learning_rate": 3.2488525657015014e-06, "loss": 0.6433, "step": 3100 }, { "epoch": 0.6255819597978862, "grad_norm": 2.4023852348327637, "learning_rate": 3.2457921779834372e-06, "loss": 0.6498, "step": 3101 }, { "epoch": 0.6257836953540932, "grad_norm": 0.5843347907066345, "learning_rate": 3.2427325395253386e-06, "loss": 0.736, "step": 3102 }, { "epoch": 0.6259854309103001, "grad_norm": 0.5199889540672302, "learning_rate": 3.2396736516340443e-06, "loss": 0.6761, "step": 3103 }, { "epoch": 0.6261871664665072, "grad_norm": 0.45910605788230896, "learning_rate": 3.2366155156160726e-06, "loss": 0.8176, "step": 3104 }, { "epoch": 0.6263889020227141, "grad_norm": 0.9626832604408264, "learning_rate": 3.2335581327776178e-06, "loss": 0.6933, "step": 3105 }, { "epoch": 0.6265906375789212, "grad_norm": 0.3536245822906494, "learning_rate": 3.2305015044245534e-06, "loss": 0.7737, "step": 3106 }, { "epoch": 0.6267923731351281, "grad_norm": 0.5715398788452148, "learning_rate": 3.2274456318624344e-06, "loss": 0.7681, "step": 3107 }, { "epoch": 0.6269941086913351, "grad_norm": 0.6852706074714661, "learning_rate": 3.2243905163964863e-06, "loss": 0.6293, "step": 3108 }, { "epoch": 0.6271958442475422, "grad_norm": 0.5433623194694519, "learning_rate": 3.221336159331618e-06, "loss": 0.6476, "step": 3109 }, { "epoch": 0.6273975798037491, "grad_norm": 0.48686903715133667, "learning_rate": 3.218282561972407e-06, "loss": 0.6762, "step": 3110 }, { "epoch": 0.6275993153599562, "grad_norm": 0.47076553106307983, "learning_rate": 3.2152297256231137e-06, "loss": 0.6213, "step": 3111 }, { "epoch": 0.6278010509161631, "grad_norm": 0.5113581418991089, "learning_rate": 3.21217765158767e-06, "loss": 0.781, "step": 3112 }, { "epoch": 0.6280027864723701, "grad_norm": 0.3782995045185089, "learning_rate": 3.209126341169681e-06, "loss": 0.7276, "step": 3113 }, { "epoch": 0.6282045220285771, "grad_norm": 0.5735915899276733, "learning_rate": 3.2060757956724286e-06, "loss": 0.6898, "step": 3114 }, { "epoch": 0.6284062575847841, "grad_norm": 0.39048486948013306, "learning_rate": 3.203026016398867e-06, "loss": 0.6437, "step": 3115 }, { "epoch": 0.6286079931409911, "grad_norm": 0.6044697165489197, "learning_rate": 3.1999770046516198e-06, "loss": 0.6616, "step": 3116 }, { "epoch": 0.6288097286971981, "grad_norm": 0.3670637607574463, "learning_rate": 3.1969287617329887e-06, "loss": 0.8468, "step": 3117 }, { "epoch": 0.629011464253405, "grad_norm": 0.6543151140213013, "learning_rate": 3.1938812889449444e-06, "loss": 0.657, "step": 3118 }, { "epoch": 0.6292131998096121, "grad_norm": 0.32406216859817505, "learning_rate": 3.1908345875891243e-06, "loss": 0.774, "step": 3119 }, { "epoch": 0.629414935365819, "grad_norm": 0.44408175349235535, "learning_rate": 3.1877886589668423e-06, "loss": 1.0477, "step": 3120 }, { "epoch": 0.629616670922026, "grad_norm": 0.45666587352752686, "learning_rate": 3.1847435043790833e-06, "loss": 0.6835, "step": 3121 }, { "epoch": 0.6298184064782331, "grad_norm": 0.33028221130371094, "learning_rate": 3.181699125126493e-06, "loss": 0.6404, "step": 3122 }, { "epoch": 0.63002014203444, "grad_norm": 0.6274064183235168, "learning_rate": 3.178655522509395e-06, "loss": 0.6529, "step": 3123 }, { "epoch": 0.6302218775906471, "grad_norm": 0.7759518027305603, "learning_rate": 3.1756126978277756e-06, "loss": 0.6167, "step": 3124 }, { "epoch": 0.630423613146854, "grad_norm": 0.3837743401527405, "learning_rate": 3.1725706523812925e-06, "loss": 0.6501, "step": 3125 }, { "epoch": 0.630625348703061, "grad_norm": 0.5663520097732544, "learning_rate": 3.169529387469269e-06, "loss": 0.6922, "step": 3126 }, { "epoch": 0.630827084259268, "grad_norm": 0.6764240860939026, "learning_rate": 3.1664889043906928e-06, "loss": 0.6241, "step": 3127 }, { "epoch": 0.631028819815475, "grad_norm": 1.6318018436431885, "learning_rate": 3.1634492044442195e-06, "loss": 0.8103, "step": 3128 }, { "epoch": 0.631230555371682, "grad_norm": 0.42009663581848145, "learning_rate": 3.160410288928175e-06, "loss": 0.7029, "step": 3129 }, { "epoch": 0.631432290927889, "grad_norm": 0.49394020438194275, "learning_rate": 3.1573721591405405e-06, "loss": 1.0464, "step": 3130 }, { "epoch": 0.631634026484096, "grad_norm": 0.4648574888706207, "learning_rate": 3.154334816378969e-06, "loss": 0.6562, "step": 3131 }, { "epoch": 0.631835762040303, "grad_norm": 0.47843027114868164, "learning_rate": 3.151298261940775e-06, "loss": 0.7575, "step": 3132 }, { "epoch": 0.63203749759651, "grad_norm": 0.7636077404022217, "learning_rate": 3.148262497122935e-06, "loss": 0.8223, "step": 3133 }, { "epoch": 0.632239233152717, "grad_norm": 0.378098726272583, "learning_rate": 3.145227523222092e-06, "loss": 0.7765, "step": 3134 }, { "epoch": 0.632440968708924, "grad_norm": 0.5029193758964539, "learning_rate": 3.1421933415345473e-06, "loss": 0.6579, "step": 3135 }, { "epoch": 0.6326427042651309, "grad_norm": 0.6376214027404785, "learning_rate": 3.1391599533562644e-06, "loss": 0.6408, "step": 3136 }, { "epoch": 0.632844439821338, "grad_norm": 1.2196003198623657, "learning_rate": 3.1361273599828722e-06, "loss": 0.6752, "step": 3137 }, { "epoch": 0.6330461753775449, "grad_norm": 0.7059176564216614, "learning_rate": 3.1330955627096526e-06, "loss": 0.6517, "step": 3138 }, { "epoch": 0.633247910933752, "grad_norm": 0.6646839380264282, "learning_rate": 3.130064562831553e-06, "loss": 0.6575, "step": 3139 }, { "epoch": 0.6334496464899589, "grad_norm": 3.0043044090270996, "learning_rate": 3.1270343616431795e-06, "loss": 0.641, "step": 3140 }, { "epoch": 0.6336513820461659, "grad_norm": 0.34532269835472107, "learning_rate": 3.1240049604387955e-06, "loss": 0.7085, "step": 3141 }, { "epoch": 0.633853117602373, "grad_norm": 0.719542384147644, "learning_rate": 3.1209763605123233e-06, "loss": 0.6261, "step": 3142 }, { "epoch": 0.6340548531585799, "grad_norm": 0.4158155024051666, "learning_rate": 3.117948563157346e-06, "loss": 0.6272, "step": 3143 }, { "epoch": 0.6342565887147869, "grad_norm": 0.49435046315193176, "learning_rate": 3.1149215696670963e-06, "loss": 0.7016, "step": 3144 }, { "epoch": 0.6344583242709939, "grad_norm": 0.5325908660888672, "learning_rate": 3.111895381334472e-06, "loss": 0.7781, "step": 3145 }, { "epoch": 0.6346600598272009, "grad_norm": 0.48683398962020874, "learning_rate": 3.108869999452024e-06, "loss": 0.7391, "step": 3146 }, { "epoch": 0.6348617953834079, "grad_norm": 0.31066611409187317, "learning_rate": 3.105845425311954e-06, "loss": 0.6373, "step": 3147 }, { "epoch": 0.6350635309396149, "grad_norm": 0.6181333661079407, "learning_rate": 3.102821660206125e-06, "loss": 0.7017, "step": 3148 }, { "epoch": 0.6352652664958218, "grad_norm": 0.3839402198791504, "learning_rate": 3.099798705426055e-06, "loss": 0.787, "step": 3149 }, { "epoch": 0.6354670020520289, "grad_norm": 0.6559051871299744, "learning_rate": 3.0967765622629085e-06, "loss": 0.6686, "step": 3150 }, { "epoch": 0.6356687376082358, "grad_norm": 0.35817965865135193, "learning_rate": 3.0937552320075116e-06, "loss": 0.67, "step": 3151 }, { "epoch": 0.6358704731644429, "grad_norm": 0.5216743350028992, "learning_rate": 3.0907347159503364e-06, "loss": 0.6393, "step": 3152 }, { "epoch": 0.6360722087206498, "grad_norm": 0.3688260018825531, "learning_rate": 3.0877150153815126e-06, "loss": 0.7921, "step": 3153 }, { "epoch": 0.6362739442768568, "grad_norm": 1.2055909633636475, "learning_rate": 3.0846961315908206e-06, "loss": 0.7709, "step": 3154 }, { "epoch": 0.6364756798330639, "grad_norm": 0.5644564032554626, "learning_rate": 3.0816780658676857e-06, "loss": 0.6773, "step": 3155 }, { "epoch": 0.6366774153892708, "grad_norm": 0.6851127743721008, "learning_rate": 3.0786608195011938e-06, "loss": 0.6381, "step": 3156 }, { "epoch": 0.6368791509454779, "grad_norm": 0.33154991269111633, "learning_rate": 3.0756443937800757e-06, "loss": 0.6474, "step": 3157 }, { "epoch": 0.6370808865016848, "grad_norm": 0.6636943817138672, "learning_rate": 3.0726287899927075e-06, "loss": 0.7441, "step": 3158 }, { "epoch": 0.6372826220578918, "grad_norm": 0.4501652717590332, "learning_rate": 3.069614009427123e-06, "loss": 0.7027, "step": 3159 }, { "epoch": 0.6374843576140988, "grad_norm": 0.48597389459609985, "learning_rate": 3.0666000533709984e-06, "loss": 0.7337, "step": 3160 }, { "epoch": 0.6376860931703058, "grad_norm": 0.9662090539932251, "learning_rate": 3.063586923111658e-06, "loss": 0.7431, "step": 3161 }, { "epoch": 0.6378878287265127, "grad_norm": 0.403626024723053, "learning_rate": 3.0605746199360755e-06, "loss": 0.6392, "step": 3162 }, { "epoch": 0.6380895642827198, "grad_norm": 0.43942490220069885, "learning_rate": 3.057563145130873e-06, "loss": 0.6931, "step": 3163 }, { "epoch": 0.6382912998389267, "grad_norm": 0.5516834855079651, "learning_rate": 3.054552499982312e-06, "loss": 0.7797, "step": 3164 }, { "epoch": 0.6384930353951338, "grad_norm": 0.9289595484733582, "learning_rate": 3.0515426857763087e-06, "loss": 0.6549, "step": 3165 }, { "epoch": 0.6386947709513408, "grad_norm": 0.3789737820625305, "learning_rate": 3.0485337037984146e-06, "loss": 0.6868, "step": 3166 }, { "epoch": 0.6388965065075477, "grad_norm": 0.761205792427063, "learning_rate": 3.045525555333834e-06, "loss": 0.7468, "step": 3167 }, { "epoch": 0.6390982420637548, "grad_norm": 0.46798014640808105, "learning_rate": 3.0425182416674117e-06, "loss": 0.6358, "step": 3168 }, { "epoch": 0.6392999776199617, "grad_norm": 0.4467335641384125, "learning_rate": 3.0395117640836337e-06, "loss": 0.6945, "step": 3169 }, { "epoch": 0.6395017131761688, "grad_norm": 0.41640374064445496, "learning_rate": 3.0365061238666336e-06, "loss": 0.6827, "step": 3170 }, { "epoch": 0.6397034487323757, "grad_norm": 0.3545227348804474, "learning_rate": 3.0335013223001865e-06, "loss": 0.783, "step": 3171 }, { "epoch": 0.6399051842885827, "grad_norm": 1.406578540802002, "learning_rate": 3.0304973606677044e-06, "loss": 0.6595, "step": 3172 }, { "epoch": 0.6401069198447897, "grad_norm": 1.4224039316177368, "learning_rate": 3.027494240252246e-06, "loss": 0.6437, "step": 3173 }, { "epoch": 0.6403086554009967, "grad_norm": 0.3247050344944, "learning_rate": 3.024491962336511e-06, "loss": 0.6654, "step": 3174 }, { "epoch": 0.6405103909572037, "grad_norm": 0.3300015330314636, "learning_rate": 3.021490528202831e-06, "loss": 0.7923, "step": 3175 }, { "epoch": 0.6407121265134107, "grad_norm": 0.48039737343788147, "learning_rate": 3.018489939133188e-06, "loss": 0.691, "step": 3176 }, { "epoch": 0.6409138620696176, "grad_norm": 3.007985830307007, "learning_rate": 3.0154901964091993e-06, "loss": 0.7091, "step": 3177 }, { "epoch": 0.6411155976258247, "grad_norm": 0.9376941323280334, "learning_rate": 3.0124913013121148e-06, "loss": 0.7613, "step": 3178 }, { "epoch": 0.6413173331820317, "grad_norm": 1.181395411491394, "learning_rate": 3.009493255122831e-06, "loss": 0.6758, "step": 3179 }, { "epoch": 0.6415190687382386, "grad_norm": 0.3755936026573181, "learning_rate": 3.0064960591218763e-06, "loss": 0.7249, "step": 3180 }, { "epoch": 0.6417208042944457, "grad_norm": 0.40885961055755615, "learning_rate": 3.0034997145894178e-06, "loss": 0.689, "step": 3181 }, { "epoch": 0.6419225398506526, "grad_norm": 0.41958701610565186, "learning_rate": 3.0005042228052604e-06, "loss": 0.7136, "step": 3182 }, { "epoch": 0.6421242754068597, "grad_norm": 0.4304403066635132, "learning_rate": 2.9975095850488412e-06, "loss": 0.7982, "step": 3183 }, { "epoch": 0.6423260109630666, "grad_norm": 0.4791124761104584, "learning_rate": 2.9945158025992354e-06, "loss": 0.7134, "step": 3184 }, { "epoch": 0.6425277465192736, "grad_norm": 0.7359556555747986, "learning_rate": 2.991522876735154e-06, "loss": 0.8083, "step": 3185 }, { "epoch": 0.6427294820754806, "grad_norm": 0.48312368988990784, "learning_rate": 2.9885308087349364e-06, "loss": 0.6765, "step": 3186 }, { "epoch": 0.6429312176316876, "grad_norm": 0.6698987483978271, "learning_rate": 2.9855395998765607e-06, "loss": 0.6487, "step": 3187 }, { "epoch": 0.6431329531878947, "grad_norm": 0.44107022881507874, "learning_rate": 2.982549251437638e-06, "loss": 0.6513, "step": 3188 }, { "epoch": 0.6433346887441016, "grad_norm": 0.9434458017349243, "learning_rate": 2.979559764695409e-06, "loss": 0.6799, "step": 3189 }, { "epoch": 0.6435364243003086, "grad_norm": 0.5217306017875671, "learning_rate": 2.9765711409267484e-06, "loss": 0.6945, "step": 3190 }, { "epoch": 0.6437381598565156, "grad_norm": 0.6917257308959961, "learning_rate": 2.9735833814081627e-06, "loss": 0.7392, "step": 3191 }, { "epoch": 0.6439398954127226, "grad_norm": 1.2743409872055054, "learning_rate": 2.9705964874157865e-06, "loss": 0.6708, "step": 3192 }, { "epoch": 0.6441416309689296, "grad_norm": 0.4855993390083313, "learning_rate": 2.967610460225391e-06, "loss": 0.8801, "step": 3193 }, { "epoch": 0.6443433665251366, "grad_norm": 1.0808053016662598, "learning_rate": 2.964625301112366e-06, "loss": 0.7138, "step": 3194 }, { "epoch": 0.6445451020813435, "grad_norm": 0.6945085525512695, "learning_rate": 2.9616410113517405e-06, "loss": 0.8535, "step": 3195 }, { "epoch": 0.6447468376375506, "grad_norm": 0.3475915789604187, "learning_rate": 2.9586575922181724e-06, "loss": 0.8371, "step": 3196 }, { "epoch": 0.6449485731937575, "grad_norm": 0.6857219338417053, "learning_rate": 2.9556750449859396e-06, "loss": 0.6513, "step": 3197 }, { "epoch": 0.6451503087499645, "grad_norm": 0.4195650517940521, "learning_rate": 2.952693370928953e-06, "loss": 0.6669, "step": 3198 }, { "epoch": 0.6453520443061715, "grad_norm": 0.7740408182144165, "learning_rate": 2.9497125713207518e-06, "loss": 0.6673, "step": 3199 }, { "epoch": 0.6455537798623785, "grad_norm": 0.34402042627334595, "learning_rate": 2.9467326474344983e-06, "loss": 0.6437, "step": 3200 }, { "epoch": 0.6457555154185856, "grad_norm": 0.7659520506858826, "learning_rate": 2.943753600542982e-06, "loss": 0.6758, "step": 3201 }, { "epoch": 0.6459572509747925, "grad_norm": 0.3940645754337311, "learning_rate": 2.940775431918621e-06, "loss": 0.6867, "step": 3202 }, { "epoch": 0.6461589865309995, "grad_norm": 0.5474492311477661, "learning_rate": 2.9377981428334494e-06, "loss": 0.9003, "step": 3203 }, { "epoch": 0.6463607220872065, "grad_norm": 0.4333469867706299, "learning_rate": 2.9348217345591367e-06, "loss": 0.6497, "step": 3204 }, { "epoch": 0.6465624576434135, "grad_norm": 0.31425702571868896, "learning_rate": 2.9318462083669706e-06, "loss": 0.7416, "step": 3205 }, { "epoch": 0.6467641931996205, "grad_norm": 0.6604920625686646, "learning_rate": 2.9288715655278605e-06, "loss": 0.7005, "step": 3206 }, { "epoch": 0.6469659287558275, "grad_norm": 1.00324285030365, "learning_rate": 2.9258978073123413e-06, "loss": 0.7507, "step": 3207 }, { "epoch": 0.6471676643120344, "grad_norm": 0.3944019675254822, "learning_rate": 2.9229249349905686e-06, "loss": 0.8014, "step": 3208 }, { "epoch": 0.6473693998682415, "grad_norm": 0.4360457956790924, "learning_rate": 2.9199529498323207e-06, "loss": 0.7602, "step": 3209 }, { "epoch": 0.6475711354244484, "grad_norm": 0.4374183416366577, "learning_rate": 2.916981853106997e-06, "loss": 0.657, "step": 3210 }, { "epoch": 0.6477728709806555, "grad_norm": 0.335193008184433, "learning_rate": 2.9140116460836175e-06, "loss": 0.6482, "step": 3211 }, { "epoch": 0.6479746065368625, "grad_norm": 0.43707412481307983, "learning_rate": 2.9110423300308182e-06, "loss": 0.7353, "step": 3212 }, { "epoch": 0.6481763420930694, "grad_norm": 1.7312297821044922, "learning_rate": 2.9080739062168626e-06, "loss": 0.6955, "step": 3213 }, { "epoch": 0.6483780776492765, "grad_norm": 1.1668918132781982, "learning_rate": 2.9051063759096264e-06, "loss": 0.6381, "step": 3214 }, { "epoch": 0.6485798132054834, "grad_norm": 1.7101664543151855, "learning_rate": 2.9021397403766034e-06, "loss": 0.6756, "step": 3215 }, { "epoch": 0.6487815487616904, "grad_norm": 0.6171029210090637, "learning_rate": 2.8991740008849117e-06, "loss": 0.7846, "step": 3216 }, { "epoch": 0.6489832843178974, "grad_norm": 0.3693191707134247, "learning_rate": 2.896209158701281e-06, "loss": 0.6294, "step": 3217 }, { "epoch": 0.6491850198741044, "grad_norm": 0.40041401982307434, "learning_rate": 2.8932452150920576e-06, "loss": 0.909, "step": 3218 }, { "epoch": 0.6493867554303114, "grad_norm": 0.3019446134567261, "learning_rate": 2.89028217132321e-06, "loss": 0.6771, "step": 3219 }, { "epoch": 0.6495884909865184, "grad_norm": 0.5917576551437378, "learning_rate": 2.887320028660312e-06, "loss": 0.7757, "step": 3220 }, { "epoch": 0.6497902265427253, "grad_norm": 0.4102381467819214, "learning_rate": 2.884358788368563e-06, "loss": 0.6877, "step": 3221 }, { "epoch": 0.6499919620989324, "grad_norm": 0.39108479022979736, "learning_rate": 2.8813984517127723e-06, "loss": 0.7356, "step": 3222 }, { "epoch": 0.6501936976551393, "grad_norm": 0.30545833706855774, "learning_rate": 2.87843901995736e-06, "loss": 0.6435, "step": 3223 }, { "epoch": 0.6503954332113464, "grad_norm": 0.7336997389793396, "learning_rate": 2.875480494366367e-06, "loss": 0.7055, "step": 3224 }, { "epoch": 0.6505971687675534, "grad_norm": 0.31208398938179016, "learning_rate": 2.872522876203443e-06, "loss": 0.7695, "step": 3225 }, { "epoch": 0.6507989043237603, "grad_norm": 0.5648106336593628, "learning_rate": 2.8695661667318465e-06, "loss": 0.6477, "step": 3226 }, { "epoch": 0.6510006398799674, "grad_norm": 0.546466588973999, "learning_rate": 2.8666103672144597e-06, "loss": 0.7945, "step": 3227 }, { "epoch": 0.6512023754361743, "grad_norm": 0.4231576919555664, "learning_rate": 2.8636554789137587e-06, "loss": 0.6658, "step": 3228 }, { "epoch": 0.6514041109923814, "grad_norm": 0.6607463955879211, "learning_rate": 2.860701503091845e-06, "loss": 0.6531, "step": 3229 }, { "epoch": 0.6516058465485883, "grad_norm": 0.6346076130867004, "learning_rate": 2.8577484410104283e-06, "loss": 0.6459, "step": 3230 }, { "epoch": 0.6518075821047953, "grad_norm": 0.593224823474884, "learning_rate": 2.8547962939308187e-06, "loss": 0.6421, "step": 3231 }, { "epoch": 0.6520093176610023, "grad_norm": 0.3373255431652069, "learning_rate": 2.8518450631139467e-06, "loss": 0.7888, "step": 3232 }, { "epoch": 0.6522110532172093, "grad_norm": 0.4355120062828064, "learning_rate": 2.8488947498203445e-06, "loss": 0.6563, "step": 3233 }, { "epoch": 0.6524127887734164, "grad_norm": 1.12356698513031, "learning_rate": 2.8459453553101526e-06, "loss": 0.6683, "step": 3234 }, { "epoch": 0.6526145243296233, "grad_norm": 0.37360039353370667, "learning_rate": 2.8429968808431275e-06, "loss": 0.7465, "step": 3235 }, { "epoch": 0.6528162598858303, "grad_norm": 0.5207381844520569, "learning_rate": 2.840049327678618e-06, "loss": 0.7123, "step": 3236 }, { "epoch": 0.6530179954420373, "grad_norm": 0.47542569041252136, "learning_rate": 2.8371026970755903e-06, "loss": 0.6815, "step": 3237 }, { "epoch": 0.6532197309982443, "grad_norm": 0.49312570691108704, "learning_rate": 2.8341569902926198e-06, "loss": 0.6957, "step": 3238 }, { "epoch": 0.6534214665544512, "grad_norm": 0.39625513553619385, "learning_rate": 2.8312122085878725e-06, "loss": 0.7877, "step": 3239 }, { "epoch": 0.6536232021106583, "grad_norm": 1.3322360515594482, "learning_rate": 2.8282683532191333e-06, "loss": 0.7897, "step": 3240 }, { "epoch": 0.6538249376668652, "grad_norm": 1.1602030992507935, "learning_rate": 2.825325425443786e-06, "loss": 0.67, "step": 3241 }, { "epoch": 0.6540266732230723, "grad_norm": 0.5422742962837219, "learning_rate": 2.8223834265188154e-06, "loss": 0.9461, "step": 3242 }, { "epoch": 0.6542284087792792, "grad_norm": 1.4504823684692383, "learning_rate": 2.8194423577008167e-06, "loss": 0.6599, "step": 3243 }, { "epoch": 0.6544301443354862, "grad_norm": 0.8231688737869263, "learning_rate": 2.816502220245982e-06, "loss": 0.6647, "step": 3244 }, { "epoch": 0.6546318798916932, "grad_norm": 1.09976327419281, "learning_rate": 2.8135630154101044e-06, "loss": 0.6879, "step": 3245 }, { "epoch": 0.6548336154479002, "grad_norm": 0.37235990166664124, "learning_rate": 2.810624744448588e-06, "loss": 0.6403, "step": 3246 }, { "epoch": 0.6550353510041073, "grad_norm": 0.3631633520126343, "learning_rate": 2.807687408616427e-06, "loss": 0.6878, "step": 3247 }, { "epoch": 0.6552370865603142, "grad_norm": 0.5159462094306946, "learning_rate": 2.8047510091682223e-06, "loss": 0.6744, "step": 3248 }, { "epoch": 0.6554388221165212, "grad_norm": 0.81461101770401, "learning_rate": 2.801815547358173e-06, "loss": 0.6666, "step": 3249 }, { "epoch": 0.6556405576727282, "grad_norm": 0.5176712870597839, "learning_rate": 2.7988810244400766e-06, "loss": 0.6644, "step": 3250 }, { "epoch": 0.6558422932289352, "grad_norm": 0.4347783625125885, "learning_rate": 2.795947441667334e-06, "loss": 0.6467, "step": 3251 }, { "epoch": 0.6560440287851422, "grad_norm": 0.5249154567718506, "learning_rate": 2.79301480029294e-06, "loss": 0.6968, "step": 3252 }, { "epoch": 0.6562457643413492, "grad_norm": 0.4152076244354248, "learning_rate": 2.7900831015694884e-06, "loss": 0.8355, "step": 3253 }, { "epoch": 0.6564474998975561, "grad_norm": 0.6886522769927979, "learning_rate": 2.787152346749173e-06, "loss": 0.6496, "step": 3254 }, { "epoch": 0.6566492354537632, "grad_norm": 0.4261277914047241, "learning_rate": 2.784222537083781e-06, "loss": 0.6611, "step": 3255 }, { "epoch": 0.6568509710099701, "grad_norm": 0.41764065623283386, "learning_rate": 2.7812936738246977e-06, "loss": 0.7819, "step": 3256 }, { "epoch": 0.6570527065661771, "grad_norm": 0.4790710508823395, "learning_rate": 2.7783657582229006e-06, "loss": 0.6821, "step": 3257 }, { "epoch": 0.6572544421223842, "grad_norm": 0.7666770815849304, "learning_rate": 2.775438791528971e-06, "loss": 0.6297, "step": 3258 }, { "epoch": 0.6574561776785911, "grad_norm": 0.3256152272224426, "learning_rate": 2.7725127749930768e-06, "loss": 0.6162, "step": 3259 }, { "epoch": 0.6576579132347982, "grad_norm": 0.39912089705467224, "learning_rate": 2.7695877098649828e-06, "loss": 0.7555, "step": 3260 }, { "epoch": 0.6578596487910051, "grad_norm": 0.3404659330844879, "learning_rate": 2.766663597394044e-06, "loss": 0.6616, "step": 3261 }, { "epoch": 0.6580613843472121, "grad_norm": 0.7698021531105042, "learning_rate": 2.7637404388292184e-06, "loss": 0.6816, "step": 3262 }, { "epoch": 0.6582631199034191, "grad_norm": 0.4176563620567322, "learning_rate": 2.760818235419046e-06, "loss": 0.6545, "step": 3263 }, { "epoch": 0.6584648554596261, "grad_norm": 0.9553281664848328, "learning_rate": 2.757896988411662e-06, "loss": 0.6904, "step": 3264 }, { "epoch": 0.6586665910158331, "grad_norm": 0.3938625752925873, "learning_rate": 2.7549766990547973e-06, "loss": 0.7029, "step": 3265 }, { "epoch": 0.6588683265720401, "grad_norm": 0.5802119970321655, "learning_rate": 2.7520573685957685e-06, "loss": 0.7366, "step": 3266 }, { "epoch": 0.659070062128247, "grad_norm": 0.4900028109550476, "learning_rate": 2.7491389982814846e-06, "loss": 0.6723, "step": 3267 }, { "epoch": 0.6592717976844541, "grad_norm": 0.47692182660102844, "learning_rate": 2.746221589358443e-06, "loss": 0.8045, "step": 3268 }, { "epoch": 0.659473533240661, "grad_norm": 0.3741524815559387, "learning_rate": 2.7433051430727353e-06, "loss": 0.6767, "step": 3269 }, { "epoch": 0.6596752687968681, "grad_norm": 1.0025602579116821, "learning_rate": 2.7403896606700363e-06, "loss": 0.86, "step": 3270 }, { "epoch": 0.6598770043530751, "grad_norm": 0.3644028306007385, "learning_rate": 2.7374751433956103e-06, "loss": 0.6842, "step": 3271 }, { "epoch": 0.660078739909282, "grad_norm": 0.7453895211219788, "learning_rate": 2.734561592494314e-06, "loss": 0.6767, "step": 3272 }, { "epoch": 0.6602804754654891, "grad_norm": 0.6459788084030151, "learning_rate": 2.7316490092105856e-06, "loss": 0.6485, "step": 3273 }, { "epoch": 0.660482211021696, "grad_norm": 0.9067775011062622, "learning_rate": 2.7287373947884523e-06, "loss": 0.7792, "step": 3274 }, { "epoch": 0.660683946577903, "grad_norm": 0.42903923988342285, "learning_rate": 2.7258267504715276e-06, "loss": 0.6674, "step": 3275 }, { "epoch": 0.66088568213411, "grad_norm": 0.4832659363746643, "learning_rate": 2.7229170775030078e-06, "loss": 0.6905, "step": 3276 }, { "epoch": 0.661087417690317, "grad_norm": 0.3498452305793762, "learning_rate": 2.720008377125682e-06, "loss": 0.7758, "step": 3277 }, { "epoch": 0.661289153246524, "grad_norm": 0.5607253909111023, "learning_rate": 2.7171006505819153e-06, "loss": 0.6126, "step": 3278 }, { "epoch": 0.661490888802731, "grad_norm": 0.6438088417053223, "learning_rate": 2.7141938991136597e-06, "loss": 0.6824, "step": 3279 }, { "epoch": 0.6616926243589379, "grad_norm": 0.5905462503433228, "learning_rate": 2.711288123962455e-06, "loss": 0.6258, "step": 3280 }, { "epoch": 0.661894359915145, "grad_norm": 0.3438940942287445, "learning_rate": 2.7083833263694182e-06, "loss": 0.6689, "step": 3281 }, { "epoch": 0.662096095471352, "grad_norm": 0.49798575043678284, "learning_rate": 2.7054795075752494e-06, "loss": 0.8654, "step": 3282 }, { "epoch": 0.662297831027559, "grad_norm": 0.5611802339553833, "learning_rate": 2.702576668820237e-06, "loss": 0.6325, "step": 3283 }, { "epoch": 0.662499566583766, "grad_norm": 0.3897034823894501, "learning_rate": 2.6996748113442397e-06, "loss": 0.7167, "step": 3284 }, { "epoch": 0.6627013021399729, "grad_norm": 0.30359792709350586, "learning_rate": 2.696773936386706e-06, "loss": 0.7844, "step": 3285 }, { "epoch": 0.66290303769618, "grad_norm": 0.7668525576591492, "learning_rate": 2.6938740451866674e-06, "loss": 0.7037, "step": 3286 }, { "epoch": 0.6631047732523869, "grad_norm": 0.38820183277130127, "learning_rate": 2.690975138982721e-06, "loss": 0.6852, "step": 3287 }, { "epoch": 0.663306508808594, "grad_norm": 0.4365038275718689, "learning_rate": 2.6880772190130576e-06, "loss": 0.7843, "step": 3288 }, { "epoch": 0.6635082443648009, "grad_norm": 0.4769340753555298, "learning_rate": 2.6851802865154403e-06, "loss": 0.8124, "step": 3289 }, { "epoch": 0.6637099799210079, "grad_norm": 1.5809507369995117, "learning_rate": 2.6822843427272094e-06, "loss": 0.6859, "step": 3290 }, { "epoch": 0.663911715477215, "grad_norm": 0.3428616523742676, "learning_rate": 2.6793893888852885e-06, "loss": 0.6811, "step": 3291 }, { "epoch": 0.6641134510334219, "grad_norm": 0.5171772837638855, "learning_rate": 2.676495426226172e-06, "loss": 0.7681, "step": 3292 }, { "epoch": 0.6643151865896288, "grad_norm": 1.7425131797790527, "learning_rate": 2.6736024559859335e-06, "loss": 0.8144, "step": 3293 }, { "epoch": 0.6645169221458359, "grad_norm": 0.3819194734096527, "learning_rate": 2.6707104794002283e-06, "loss": 0.673, "step": 3294 }, { "epoch": 0.6647186577020429, "grad_norm": 0.41104787588119507, "learning_rate": 2.6678194977042727e-06, "loss": 0.8096, "step": 3295 }, { "epoch": 0.6649203932582499, "grad_norm": 1.118825078010559, "learning_rate": 2.6649295121328745e-06, "loss": 0.6665, "step": 3296 }, { "epoch": 0.6651221288144569, "grad_norm": 0.38198497891426086, "learning_rate": 2.6620405239204066e-06, "loss": 0.6618, "step": 3297 }, { "epoch": 0.6653238643706638, "grad_norm": 5.1874237060546875, "learning_rate": 2.6591525343008164e-06, "loss": 0.7858, "step": 3298 }, { "epoch": 0.6655255999268709, "grad_norm": 0.32285603880882263, "learning_rate": 2.6562655445076296e-06, "loss": 0.7059, "step": 3299 }, { "epoch": 0.6657273354830778, "grad_norm": 0.6265811920166016, "learning_rate": 2.6533795557739407e-06, "loss": 0.6951, "step": 3300 }, { "epoch": 0.6659290710392849, "grad_norm": 0.3196472227573395, "learning_rate": 2.650494569332415e-06, "loss": 0.6171, "step": 3301 }, { "epoch": 0.6661308065954918, "grad_norm": 0.39705124497413635, "learning_rate": 2.647610586415299e-06, "loss": 0.6602, "step": 3302 }, { "epoch": 0.6663325421516988, "grad_norm": 0.44583508372306824, "learning_rate": 2.644727608254396e-06, "loss": 0.6791, "step": 3303 }, { "epoch": 0.6665342777079059, "grad_norm": 0.3522447645664215, "learning_rate": 2.6418456360810918e-06, "loss": 0.6254, "step": 3304 }, { "epoch": 0.6667360132641128, "grad_norm": 0.7186568379402161, "learning_rate": 2.6389646711263417e-06, "loss": 0.6565, "step": 3305 }, { "epoch": 0.6669377488203199, "grad_norm": 0.7441889047622681, "learning_rate": 2.6360847146206624e-06, "loss": 0.6145, "step": 3306 }, { "epoch": 0.6671394843765268, "grad_norm": 0.3817283809185028, "learning_rate": 2.633205767794149e-06, "loss": 0.6737, "step": 3307 }, { "epoch": 0.6673412199327338, "grad_norm": 1.152697205543518, "learning_rate": 2.6303278318764613e-06, "loss": 0.6301, "step": 3308 }, { "epoch": 0.6675429554889408, "grad_norm": 0.7283335328102112, "learning_rate": 2.6274509080968252e-06, "loss": 0.7515, "step": 3309 }, { "epoch": 0.6677446910451478, "grad_norm": 0.6745430827140808, "learning_rate": 2.6245749976840406e-06, "loss": 0.6622, "step": 3310 }, { "epoch": 0.6679464266013547, "grad_norm": 0.3779880702495575, "learning_rate": 2.621700101866469e-06, "loss": 0.6511, "step": 3311 }, { "epoch": 0.6681481621575618, "grad_norm": 0.40893808007240295, "learning_rate": 2.618826221872039e-06, "loss": 0.6498, "step": 3312 }, { "epoch": 0.6683498977137687, "grad_norm": 0.3666467070579529, "learning_rate": 2.61595335892825e-06, "loss": 0.6254, "step": 3313 }, { "epoch": 0.6685516332699758, "grad_norm": 0.8473466634750366, "learning_rate": 2.6130815142621614e-06, "loss": 0.7145, "step": 3314 }, { "epoch": 0.6687533688261827, "grad_norm": 0.6159899830818176, "learning_rate": 2.6102106891004002e-06, "loss": 0.6858, "step": 3315 }, { "epoch": 0.6689551043823897, "grad_norm": 0.5568851828575134, "learning_rate": 2.6073408846691582e-06, "loss": 0.7757, "step": 3316 }, { "epoch": 0.6691568399385968, "grad_norm": 0.6199424862861633, "learning_rate": 2.6044721021941887e-06, "loss": 0.6802, "step": 3317 }, { "epoch": 0.6693585754948037, "grad_norm": 0.3639499247074127, "learning_rate": 2.601604342900814e-06, "loss": 0.6053, "step": 3318 }, { "epoch": 0.6695603110510108, "grad_norm": 0.4354953467845917, "learning_rate": 2.5987376080139136e-06, "loss": 0.6559, "step": 3319 }, { "epoch": 0.6697620466072177, "grad_norm": 0.481569766998291, "learning_rate": 2.5958718987579313e-06, "loss": 0.6343, "step": 3320 }, { "epoch": 0.6699637821634247, "grad_norm": 0.4260782301425934, "learning_rate": 2.5930072163568752e-06, "loss": 0.7423, "step": 3321 }, { "epoch": 0.6701655177196317, "grad_norm": 0.46539121866226196, "learning_rate": 2.590143562034312e-06, "loss": 0.7246, "step": 3322 }, { "epoch": 0.6703672532758387, "grad_norm": 0.53533536195755, "learning_rate": 2.5872809370133704e-06, "loss": 0.6694, "step": 3323 }, { "epoch": 0.6705689888320457, "grad_norm": 0.783592700958252, "learning_rate": 2.5844193425167374e-06, "loss": 0.6935, "step": 3324 }, { "epoch": 0.6707707243882527, "grad_norm": 0.364938884973526, "learning_rate": 2.581558779766664e-06, "loss": 0.6913, "step": 3325 }, { "epoch": 0.6709724599444596, "grad_norm": 0.3413155674934387, "learning_rate": 2.5786992499849584e-06, "loss": 0.7899, "step": 3326 }, { "epoch": 0.6711741955006667, "grad_norm": 0.6075354814529419, "learning_rate": 2.575840754392984e-06, "loss": 0.6825, "step": 3327 }, { "epoch": 0.6713759310568737, "grad_norm": 0.6059809923171997, "learning_rate": 2.5729832942116705e-06, "loss": 0.6617, "step": 3328 }, { "epoch": 0.6715776666130807, "grad_norm": 0.49375784397125244, "learning_rate": 2.570126870661499e-06, "loss": 0.6622, "step": 3329 }, { "epoch": 0.6717794021692877, "grad_norm": 0.46630603075027466, "learning_rate": 2.5672714849625084e-06, "loss": 0.6755, "step": 3330 }, { "epoch": 0.6719811377254946, "grad_norm": 0.5884243249893188, "learning_rate": 2.5644171383342965e-06, "loss": 0.6426, "step": 3331 }, { "epoch": 0.6721828732817017, "grad_norm": 1.362908124923706, "learning_rate": 2.5615638319960133e-06, "loss": 0.6702, "step": 3332 }, { "epoch": 0.6723846088379086, "grad_norm": 1.3717610836029053, "learning_rate": 2.5587115671663732e-06, "loss": 0.7833, "step": 3333 }, { "epoch": 0.6725863443941156, "grad_norm": 0.3735629618167877, "learning_rate": 2.555860345063636e-06, "loss": 0.6799, "step": 3334 }, { "epoch": 0.6727880799503226, "grad_norm": 0.8326557874679565, "learning_rate": 2.553010166905619e-06, "loss": 0.6809, "step": 3335 }, { "epoch": 0.6729898155065296, "grad_norm": 0.7335181832313538, "learning_rate": 2.5501610339096987e-06, "loss": 0.6632, "step": 3336 }, { "epoch": 0.6731915510627366, "grad_norm": 2.6886038780212402, "learning_rate": 2.547312947292799e-06, "loss": 0.7022, "step": 3337 }, { "epoch": 0.6733932866189436, "grad_norm": 0.48173320293426514, "learning_rate": 2.5444659082713978e-06, "loss": 0.8309, "step": 3338 }, { "epoch": 0.6735950221751505, "grad_norm": 1.1179413795471191, "learning_rate": 2.5416199180615297e-06, "loss": 0.6615, "step": 3339 }, { "epoch": 0.6737967577313576, "grad_norm": 0.6155341267585754, "learning_rate": 2.5387749778787775e-06, "loss": 0.67, "step": 3340 }, { "epoch": 0.6739984932875646, "grad_norm": 0.5885789394378662, "learning_rate": 2.535931088938274e-06, "loss": 0.6407, "step": 3341 }, { "epoch": 0.6742002288437716, "grad_norm": 0.6222666501998901, "learning_rate": 2.5330882524547107e-06, "loss": 0.8534, "step": 3342 }, { "epoch": 0.6744019643999786, "grad_norm": 0.6253111362457275, "learning_rate": 2.530246469642318e-06, "loss": 0.6266, "step": 3343 }, { "epoch": 0.6746036999561855, "grad_norm": 0.3401983976364136, "learning_rate": 2.5274057417148866e-06, "loss": 0.6534, "step": 3344 }, { "epoch": 0.6748054355123926, "grad_norm": 0.6543856263160706, "learning_rate": 2.524566069885752e-06, "loss": 0.7743, "step": 3345 }, { "epoch": 0.6750071710685995, "grad_norm": 0.5683391690254211, "learning_rate": 2.5217274553677975e-06, "loss": 0.7044, "step": 3346 }, { "epoch": 0.6752089066248066, "grad_norm": 0.34736117720603943, "learning_rate": 2.5188898993734594e-06, "loss": 0.8291, "step": 3347 }, { "epoch": 0.6754106421810135, "grad_norm": 0.389544814825058, "learning_rate": 2.5160534031147175e-06, "loss": 0.659, "step": 3348 }, { "epoch": 0.6756123777372205, "grad_norm": 1.0548666715621948, "learning_rate": 2.5132179678030995e-06, "loss": 0.686, "step": 3349 }, { "epoch": 0.6758141132934276, "grad_norm": 0.40486329793930054, "learning_rate": 2.5103835946496846e-06, "loss": 0.784, "step": 3350 }, { "epoch": 0.6760158488496345, "grad_norm": 0.37598225474357605, "learning_rate": 2.507550284865089e-06, "loss": 0.7081, "step": 3351 }, { "epoch": 0.6762175844058415, "grad_norm": 0.5921080708503723, "learning_rate": 2.504718039659483e-06, "loss": 0.7131, "step": 3352 }, { "epoch": 0.6764193199620485, "grad_norm": 0.35993483662605286, "learning_rate": 2.5018868602425846e-06, "loss": 0.9476, "step": 3353 }, { "epoch": 0.6766210555182555, "grad_norm": 0.45840582251548767, "learning_rate": 2.499056747823642e-06, "loss": 0.7662, "step": 3354 }, { "epoch": 0.6768227910744625, "grad_norm": 0.6039509773254395, "learning_rate": 2.4962277036114648e-06, "loss": 0.6869, "step": 3355 }, { "epoch": 0.6770245266306695, "grad_norm": 0.6492475867271423, "learning_rate": 2.493399728814396e-06, "loss": 0.6659, "step": 3356 }, { "epoch": 0.6772262621868764, "grad_norm": 0.3330156207084656, "learning_rate": 2.4905728246403226e-06, "loss": 0.7897, "step": 3357 }, { "epoch": 0.6774279977430835, "grad_norm": 0.5170933604240417, "learning_rate": 2.4877469922966823e-06, "loss": 0.8187, "step": 3358 }, { "epoch": 0.6776297332992904, "grad_norm": 0.5229772925376892, "learning_rate": 2.484922232990441e-06, "loss": 0.698, "step": 3359 }, { "epoch": 0.6778314688554975, "grad_norm": 0.40928876399993896, "learning_rate": 2.4820985479281184e-06, "loss": 0.6308, "step": 3360 }, { "epoch": 0.6780332044117044, "grad_norm": 0.6356455087661743, "learning_rate": 2.479275938315775e-06, "loss": 0.6408, "step": 3361 }, { "epoch": 0.6782349399679114, "grad_norm": 0.405601441860199, "learning_rate": 2.4764544053590005e-06, "loss": 0.9088, "step": 3362 }, { "epoch": 0.6784366755241185, "grad_norm": 0.5116518139839172, "learning_rate": 2.4736339502629385e-06, "loss": 0.7813, "step": 3363 }, { "epoch": 0.6786384110803254, "grad_norm": 0.4038514196872711, "learning_rate": 2.4708145742322643e-06, "loss": 0.67, "step": 3364 }, { "epoch": 0.6788401466365325, "grad_norm": 0.44164231419563293, "learning_rate": 2.4679962784711915e-06, "loss": 0.6275, "step": 3365 }, { "epoch": 0.6790418821927394, "grad_norm": 0.45643317699432373, "learning_rate": 2.4651790641834788e-06, "loss": 0.726, "step": 3366 }, { "epoch": 0.6792436177489464, "grad_norm": 1.0231688022613525, "learning_rate": 2.4623629325724186e-06, "loss": 0.751, "step": 3367 }, { "epoch": 0.6794453533051534, "grad_norm": 0.40992307662963867, "learning_rate": 2.4595478848408377e-06, "loss": 0.6971, "step": 3368 }, { "epoch": 0.6796470888613604, "grad_norm": 0.6936296820640564, "learning_rate": 2.4567339221911086e-06, "loss": 0.6235, "step": 3369 }, { "epoch": 0.6798488244175673, "grad_norm": 0.5006226897239685, "learning_rate": 2.4539210458251333e-06, "loss": 0.6386, "step": 3370 }, { "epoch": 0.6800505599737744, "grad_norm": 0.424445778131485, "learning_rate": 2.4511092569443518e-06, "loss": 0.6737, "step": 3371 }, { "epoch": 0.6802522955299813, "grad_norm": 0.6313624978065491, "learning_rate": 2.4482985567497395e-06, "loss": 0.7417, "step": 3372 }, { "epoch": 0.6804540310861884, "grad_norm": 0.6919423341751099, "learning_rate": 2.4454889464418052e-06, "loss": 0.6836, "step": 3373 }, { "epoch": 0.6806557666423954, "grad_norm": 1.5375241041183472, "learning_rate": 2.4426804272205985e-06, "loss": 0.6692, "step": 3374 }, { "epoch": 0.6808575021986023, "grad_norm": 1.3544081449508667, "learning_rate": 2.4398730002856958e-06, "loss": 0.8126, "step": 3375 }, { "epoch": 0.6810592377548094, "grad_norm": 0.39466825127601624, "learning_rate": 2.437066666836208e-06, "loss": 0.6005, "step": 3376 }, { "epoch": 0.6812609733110163, "grad_norm": 0.6122323870658875, "learning_rate": 2.434261428070785e-06, "loss": 0.6519, "step": 3377 }, { "epoch": 0.6814627088672234, "grad_norm": 1.1176679134368896, "learning_rate": 2.4314572851876016e-06, "loss": 0.6268, "step": 3378 }, { "epoch": 0.6816644444234303, "grad_norm": 0.4447222352027893, "learning_rate": 2.4286542393843665e-06, "loss": 0.6636, "step": 3379 }, { "epoch": 0.6818661799796373, "grad_norm": 0.8198567628860474, "learning_rate": 2.425852291858325e-06, "loss": 0.6355, "step": 3380 }, { "epoch": 0.6820679155358443, "grad_norm": 0.8290604948997498, "learning_rate": 2.423051443806247e-06, "loss": 0.6412, "step": 3381 }, { "epoch": 0.6822696510920513, "grad_norm": 0.42411890625953674, "learning_rate": 2.4202516964244347e-06, "loss": 0.6699, "step": 3382 }, { "epoch": 0.6824713866482583, "grad_norm": 0.4729047119617462, "learning_rate": 2.4174530509087193e-06, "loss": 0.7003, "step": 3383 }, { "epoch": 0.6826731222044653, "grad_norm": 0.6185622215270996, "learning_rate": 2.4146555084544665e-06, "loss": 0.6598, "step": 3384 }, { "epoch": 0.6828748577606722, "grad_norm": 0.6863579750061035, "learning_rate": 2.4118590702565643e-06, "loss": 0.6678, "step": 3385 }, { "epoch": 0.6830765933168793, "grad_norm": 0.34752097725868225, "learning_rate": 2.4090637375094323e-06, "loss": 0.6857, "step": 3386 }, { "epoch": 0.6832783288730863, "grad_norm": 0.3720141649246216, "learning_rate": 2.4062695114070156e-06, "loss": 0.8465, "step": 3387 }, { "epoch": 0.6834800644292932, "grad_norm": 0.5772742629051208, "learning_rate": 2.4034763931427917e-06, "loss": 0.6451, "step": 3388 }, { "epoch": 0.6836817999855003, "grad_norm": 0.488405704498291, "learning_rate": 2.40068438390976e-06, "loss": 0.746, "step": 3389 }, { "epoch": 0.6838835355417072, "grad_norm": 0.48354852199554443, "learning_rate": 2.3978934849004477e-06, "loss": 0.6652, "step": 3390 }, { "epoch": 0.6840852710979143, "grad_norm": 0.5033637881278992, "learning_rate": 2.395103697306906e-06, "loss": 0.6791, "step": 3391 }, { "epoch": 0.6842870066541212, "grad_norm": 0.4847155809402466, "learning_rate": 2.3923150223207176e-06, "loss": 0.7022, "step": 3392 }, { "epoch": 0.6844887422103282, "grad_norm": 0.5099661946296692, "learning_rate": 2.3895274611329826e-06, "loss": 0.6725, "step": 3393 }, { "epoch": 0.6846904777665352, "grad_norm": 0.49720004200935364, "learning_rate": 2.3867410149343284e-06, "loss": 0.7407, "step": 3394 }, { "epoch": 0.6848922133227422, "grad_norm": 0.7642709016799927, "learning_rate": 2.383955684914908e-06, "loss": 0.6789, "step": 3395 }, { "epoch": 0.6850939488789493, "grad_norm": 0.4113779366016388, "learning_rate": 2.3811714722643954e-06, "loss": 0.922, "step": 3396 }, { "epoch": 0.6852956844351562, "grad_norm": 0.6020419001579285, "learning_rate": 2.3783883781719857e-06, "loss": 0.6606, "step": 3397 }, { "epoch": 0.6854974199913632, "grad_norm": 4.857358455657959, "learning_rate": 2.3756064038264033e-06, "loss": 0.7021, "step": 3398 }, { "epoch": 0.6856991555475702, "grad_norm": 0.2977675497531891, "learning_rate": 2.3728255504158827e-06, "loss": 0.834, "step": 3399 }, { "epoch": 0.6859008911037772, "grad_norm": 0.39246630668640137, "learning_rate": 2.3700458191281913e-06, "loss": 0.6267, "step": 3400 }, { "epoch": 0.6861026266599842, "grad_norm": 0.57973712682724, "learning_rate": 2.3672672111506104e-06, "loss": 0.7184, "step": 3401 }, { "epoch": 0.6863043622161912, "grad_norm": 0.3879867196083069, "learning_rate": 2.3644897276699426e-06, "loss": 0.6248, "step": 3402 }, { "epoch": 0.6865060977723981, "grad_norm": 0.6185869574546814, "learning_rate": 2.3617133698725137e-06, "loss": 0.824, "step": 3403 }, { "epoch": 0.6867078333286052, "grad_norm": 0.4408000111579895, "learning_rate": 2.358938138944164e-06, "loss": 0.6526, "step": 3404 }, { "epoch": 0.6869095688848121, "grad_norm": 0.5326852798461914, "learning_rate": 2.3561640360702525e-06, "loss": 0.7627, "step": 3405 }, { "epoch": 0.6871113044410191, "grad_norm": 1.1812150478363037, "learning_rate": 2.353391062435665e-06, "loss": 0.6987, "step": 3406 }, { "epoch": 0.6873130399972261, "grad_norm": 0.4056047797203064, "learning_rate": 2.3506192192247893e-06, "loss": 0.8717, "step": 3407 }, { "epoch": 0.6875147755534331, "grad_norm": 0.4317014515399933, "learning_rate": 2.3478485076215444e-06, "loss": 0.7081, "step": 3408 }, { "epoch": 0.6877165111096402, "grad_norm": 0.37390607595443726, "learning_rate": 2.3450789288093646e-06, "loss": 0.7089, "step": 3409 }, { "epoch": 0.6879182466658471, "grad_norm": 0.5218164920806885, "learning_rate": 2.342310483971188e-06, "loss": 0.6372, "step": 3410 }, { "epoch": 0.6881199822220541, "grad_norm": 0.42212942242622375, "learning_rate": 2.3395431742894836e-06, "loss": 0.6279, "step": 3411 }, { "epoch": 0.6883217177782611, "grad_norm": 0.37575986981391907, "learning_rate": 2.336777000946227e-06, "loss": 0.6768, "step": 3412 }, { "epoch": 0.6885234533344681, "grad_norm": 0.5202623009681702, "learning_rate": 2.334011965122909e-06, "loss": 0.819, "step": 3413 }, { "epoch": 0.6887251888906751, "grad_norm": 0.46885550022125244, "learning_rate": 2.331248068000539e-06, "loss": 0.733, "step": 3414 }, { "epoch": 0.6889269244468821, "grad_norm": 0.7351022362709045, "learning_rate": 2.328485310759635e-06, "loss": 0.665, "step": 3415 }, { "epoch": 0.689128660003089, "grad_norm": 0.6357284188270569, "learning_rate": 2.3257236945802292e-06, "loss": 0.6295, "step": 3416 }, { "epoch": 0.6893303955592961, "grad_norm": 0.3708368241786957, "learning_rate": 2.3229632206418727e-06, "loss": 0.7518, "step": 3417 }, { "epoch": 0.689532131115503, "grad_norm": 0.8100972771644592, "learning_rate": 2.3202038901236157e-06, "loss": 0.6639, "step": 3418 }, { "epoch": 0.6897338666717101, "grad_norm": 0.47530439496040344, "learning_rate": 2.317445704204033e-06, "loss": 0.6972, "step": 3419 }, { "epoch": 0.689935602227917, "grad_norm": 0.4638540744781494, "learning_rate": 2.3146886640612045e-06, "loss": 0.6631, "step": 3420 }, { "epoch": 0.690137337784124, "grad_norm": 0.501802921295166, "learning_rate": 2.3119327708727187e-06, "loss": 0.7817, "step": 3421 }, { "epoch": 0.6903390733403311, "grad_norm": 0.46486896276474, "learning_rate": 2.3091780258156805e-06, "loss": 0.7204, "step": 3422 }, { "epoch": 0.690540808896538, "grad_norm": 0.9126157760620117, "learning_rate": 2.3064244300667e-06, "loss": 0.6801, "step": 3423 }, { "epoch": 0.6907425444527451, "grad_norm": 0.42806971073150635, "learning_rate": 2.3036719848018942e-06, "loss": 0.6358, "step": 3424 }, { "epoch": 0.690944280008952, "grad_norm": 0.30322638154029846, "learning_rate": 2.3009206911968984e-06, "loss": 0.6602, "step": 3425 }, { "epoch": 0.691146015565159, "grad_norm": 0.4272553324699402, "learning_rate": 2.2981705504268415e-06, "loss": 0.6592, "step": 3426 }, { "epoch": 0.691347751121366, "grad_norm": 0.6703861355781555, "learning_rate": 2.295421563666372e-06, "loss": 0.6591, "step": 3427 }, { "epoch": 0.691549486677573, "grad_norm": 0.5437256693840027, "learning_rate": 2.292673732089644e-06, "loss": 0.6519, "step": 3428 }, { "epoch": 0.6917512222337799, "grad_norm": 0.39333558082580566, "learning_rate": 2.2899270568703096e-06, "loss": 0.731, "step": 3429 }, { "epoch": 0.691952957789987, "grad_norm": 0.7978708744049072, "learning_rate": 2.2871815391815377e-06, "loss": 0.8463, "step": 3430 }, { "epoch": 0.692154693346194, "grad_norm": 0.37055301666259766, "learning_rate": 2.2844371801959965e-06, "loss": 0.6729, "step": 3431 }, { "epoch": 0.692356428902401, "grad_norm": 0.4103529453277588, "learning_rate": 2.281693981085859e-06, "loss": 1.1052, "step": 3432 }, { "epoch": 0.692558164458608, "grad_norm": 0.4024542570114136, "learning_rate": 2.2789519430228084e-06, "loss": 0.6711, "step": 3433 }, { "epoch": 0.6927599000148149, "grad_norm": 0.9240769147872925, "learning_rate": 2.2762110671780263e-06, "loss": 0.6759, "step": 3434 }, { "epoch": 0.692961635571022, "grad_norm": 0.5204855799674988, "learning_rate": 2.2734713547221976e-06, "loss": 0.6282, "step": 3435 }, { "epoch": 0.6931633711272289, "grad_norm": 0.7119300365447998, "learning_rate": 2.270732806825517e-06, "loss": 0.844, "step": 3436 }, { "epoch": 0.693365106683436, "grad_norm": 0.4279820919036865, "learning_rate": 2.2679954246576754e-06, "loss": 0.6066, "step": 3437 }, { "epoch": 0.6935668422396429, "grad_norm": 0.3645946681499481, "learning_rate": 2.265259209387867e-06, "loss": 0.6672, "step": 3438 }, { "epoch": 0.6937685777958499, "grad_norm": 0.35464754700660706, "learning_rate": 2.262524162184789e-06, "loss": 0.6452, "step": 3439 }, { "epoch": 0.6939703133520569, "grad_norm": 0.4235239028930664, "learning_rate": 2.2597902842166366e-06, "loss": 0.6188, "step": 3440 }, { "epoch": 0.6941720489082639, "grad_norm": 0.6387593150138855, "learning_rate": 2.2570575766511115e-06, "loss": 0.6596, "step": 3441 }, { "epoch": 0.694373784464471, "grad_norm": 0.4509795904159546, "learning_rate": 2.254326040655412e-06, "loss": 0.7899, "step": 3442 }, { "epoch": 0.6945755200206779, "grad_norm": 1.2646552324295044, "learning_rate": 2.2515956773962315e-06, "loss": 0.6117, "step": 3443 }, { "epoch": 0.6947772555768849, "grad_norm": 0.4494341313838959, "learning_rate": 2.2488664880397726e-06, "loss": 0.8168, "step": 3444 }, { "epoch": 0.6949789911330919, "grad_norm": 0.43138113617897034, "learning_rate": 2.2461384737517283e-06, "loss": 0.6593, "step": 3445 }, { "epoch": 0.6951807266892989, "grad_norm": 1.265650749206543, "learning_rate": 2.2434116356972927e-06, "loss": 0.6795, "step": 3446 }, { "epoch": 0.6953824622455058, "grad_norm": 0.7934123277664185, "learning_rate": 2.240685975041155e-06, "loss": 0.6411, "step": 3447 }, { "epoch": 0.6955841978017129, "grad_norm": 0.4912724196910858, "learning_rate": 2.237961492947507e-06, "loss": 0.6871, "step": 3448 }, { "epoch": 0.6957859333579198, "grad_norm": 0.4764300286769867, "learning_rate": 2.2352381905800325e-06, "loss": 0.6373, "step": 3449 }, { "epoch": 0.6959876689141269, "grad_norm": 0.6754266023635864, "learning_rate": 2.23251606910191e-06, "loss": 0.6287, "step": 3450 }, { "epoch": 0.6961894044703338, "grad_norm": 0.8246191143989563, "learning_rate": 2.2297951296758203e-06, "loss": 0.6471, "step": 3451 }, { "epoch": 0.6963911400265408, "grad_norm": 1.140021562576294, "learning_rate": 2.227075373463934e-06, "loss": 0.8084, "step": 3452 }, { "epoch": 0.6965928755827479, "grad_norm": 0.5953003764152527, "learning_rate": 2.2243568016279167e-06, "loss": 0.7284, "step": 3453 }, { "epoch": 0.6967946111389548, "grad_norm": 0.3563099801540375, "learning_rate": 2.221639415328928e-06, "loss": 0.6609, "step": 3454 }, { "epoch": 0.6969963466951619, "grad_norm": 0.3192271888256073, "learning_rate": 2.2189232157276247e-06, "loss": 0.6704, "step": 3455 }, { "epoch": 0.6971980822513688, "grad_norm": 1.3229840993881226, "learning_rate": 2.216208203984154e-06, "loss": 0.7263, "step": 3456 }, { "epoch": 0.6973998178075758, "grad_norm": 0.6065989136695862, "learning_rate": 2.2134943812581544e-06, "loss": 0.6356, "step": 3457 }, { "epoch": 0.6976015533637828, "grad_norm": 0.48584502935409546, "learning_rate": 2.210781748708757e-06, "loss": 0.6244, "step": 3458 }, { "epoch": 0.6978032889199898, "grad_norm": 0.5548803806304932, "learning_rate": 2.2080703074945894e-06, "loss": 0.6113, "step": 3459 }, { "epoch": 0.6980050244761968, "grad_norm": 0.6471741199493408, "learning_rate": 2.205360058773764e-06, "loss": 0.6809, "step": 3460 }, { "epoch": 0.6982067600324038, "grad_norm": 0.5157921314239502, "learning_rate": 2.202651003703885e-06, "loss": 0.6554, "step": 3461 }, { "epoch": 0.6984084955886107, "grad_norm": 0.4163440465927124, "learning_rate": 2.199943143442052e-06, "loss": 0.6819, "step": 3462 }, { "epoch": 0.6986102311448178, "grad_norm": 0.3754201829433441, "learning_rate": 2.1972364791448488e-06, "loss": 0.6534, "step": 3463 }, { "epoch": 0.6988119667010247, "grad_norm": 0.6087068319320679, "learning_rate": 2.194531011968348e-06, "loss": 0.7028, "step": 3464 }, { "epoch": 0.6990137022572317, "grad_norm": 0.7134766578674316, "learning_rate": 2.1918267430681184e-06, "loss": 0.6727, "step": 3465 }, { "epoch": 0.6992154378134388, "grad_norm": 0.4389779567718506, "learning_rate": 2.1891236735992044e-06, "loss": 0.6918, "step": 3466 }, { "epoch": 0.6994171733696457, "grad_norm": 0.8246768116950989, "learning_rate": 2.18642180471615e-06, "loss": 0.7179, "step": 3467 }, { "epoch": 0.6996189089258528, "grad_norm": 1.204871654510498, "learning_rate": 2.1837211375729812e-06, "loss": 0.6657, "step": 3468 }, { "epoch": 0.6998206444820597, "grad_norm": 0.44929951429367065, "learning_rate": 2.181021673323208e-06, "loss": 0.6923, "step": 3469 }, { "epoch": 0.7000223800382667, "grad_norm": 0.862220823764801, "learning_rate": 2.178323413119834e-06, "loss": 0.8135, "step": 3470 }, { "epoch": 0.7002241155944737, "grad_norm": 0.5849249958992004, "learning_rate": 2.1756263581153427e-06, "loss": 0.6884, "step": 3471 }, { "epoch": 0.7004258511506807, "grad_norm": 0.8377878069877625, "learning_rate": 2.1729305094617016e-06, "loss": 0.7828, "step": 3472 }, { "epoch": 0.7006275867068877, "grad_norm": 0.6120553016662598, "learning_rate": 2.170235868310372e-06, "loss": 0.6917, "step": 3473 }, { "epoch": 0.7008293222630947, "grad_norm": 0.5731369853019714, "learning_rate": 2.167542435812286e-06, "loss": 0.6501, "step": 3474 }, { "epoch": 0.7010310578193016, "grad_norm": 0.5575307607650757, "learning_rate": 2.16485021311787e-06, "loss": 0.6643, "step": 3475 }, { "epoch": 0.7012327933755087, "grad_norm": 0.5431971549987793, "learning_rate": 2.162159201377034e-06, "loss": 0.8182, "step": 3476 }, { "epoch": 0.7014345289317157, "grad_norm": 0.5338951349258423, "learning_rate": 2.1594694017391604e-06, "loss": 0.6866, "step": 3477 }, { "epoch": 0.7016362644879227, "grad_norm": 0.31306493282318115, "learning_rate": 2.156780815353125e-06, "loss": 0.6562, "step": 3478 }, { "epoch": 0.7018380000441297, "grad_norm": 0.8306349515914917, "learning_rate": 2.15409344336728e-06, "loss": 0.6357, "step": 3479 }, { "epoch": 0.7020397356003366, "grad_norm": 0.4163278639316559, "learning_rate": 2.151407286929458e-06, "loss": 0.6896, "step": 3480 }, { "epoch": 0.7022414711565437, "grad_norm": 0.6696937680244446, "learning_rate": 2.1487223471869793e-06, "loss": 0.6761, "step": 3481 }, { "epoch": 0.7024432067127506, "grad_norm": 0.3525125980377197, "learning_rate": 2.1460386252866327e-06, "loss": 0.6543, "step": 3482 }, { "epoch": 0.7026449422689576, "grad_norm": 0.5017916560173035, "learning_rate": 2.143356122374697e-06, "loss": 0.684, "step": 3483 }, { "epoch": 0.7028466778251646, "grad_norm": 0.5240808725357056, "learning_rate": 2.140674839596931e-06, "loss": 0.6856, "step": 3484 }, { "epoch": 0.7030484133813716, "grad_norm": 0.4257897436618805, "learning_rate": 2.1379947780985603e-06, "loss": 0.8324, "step": 3485 }, { "epoch": 0.7032501489375786, "grad_norm": 0.4334295094013214, "learning_rate": 2.1353159390243035e-06, "loss": 0.6699, "step": 3486 }, { "epoch": 0.7034518844937856, "grad_norm": 0.3629571497440338, "learning_rate": 2.132638323518348e-06, "loss": 0.6528, "step": 3487 }, { "epoch": 0.7036536200499925, "grad_norm": 0.469107449054718, "learning_rate": 2.129961932724359e-06, "loss": 0.6625, "step": 3488 }, { "epoch": 0.7038553556061996, "grad_norm": 0.4057089686393738, "learning_rate": 2.1272867677854853e-06, "loss": 0.9242, "step": 3489 }, { "epoch": 0.7040570911624066, "grad_norm": 0.7630118131637573, "learning_rate": 2.124612829844345e-06, "loss": 0.6927, "step": 3490 }, { "epoch": 0.7042588267186136, "grad_norm": 0.35844919085502625, "learning_rate": 2.121940120043033e-06, "loss": 0.8758, "step": 3491 }, { "epoch": 0.7044605622748206, "grad_norm": 0.7905420660972595, "learning_rate": 2.119268639523124e-06, "loss": 0.707, "step": 3492 }, { "epoch": 0.7046622978310275, "grad_norm": 0.4721790850162506, "learning_rate": 2.1165983894256647e-06, "loss": 0.6773, "step": 3493 }, { "epoch": 0.7048640333872346, "grad_norm": 0.34849104285240173, "learning_rate": 2.113929370891176e-06, "loss": 0.6873, "step": 3494 }, { "epoch": 0.7050657689434415, "grad_norm": 0.3804130554199219, "learning_rate": 2.1112615850596518e-06, "loss": 0.6803, "step": 3495 }, { "epoch": 0.7052675044996486, "grad_norm": 0.9019719958305359, "learning_rate": 2.1085950330705613e-06, "loss": 0.802, "step": 3496 }, { "epoch": 0.7054692400558555, "grad_norm": 0.37380659580230713, "learning_rate": 2.105929716062848e-06, "loss": 0.6707, "step": 3497 }, { "epoch": 0.7056709756120625, "grad_norm": 0.3215074837207794, "learning_rate": 2.103265635174926e-06, "loss": 0.6622, "step": 3498 }, { "epoch": 0.7058727111682696, "grad_norm": 0.8287083506584167, "learning_rate": 2.1006027915446785e-06, "loss": 0.785, "step": 3499 }, { "epoch": 0.7060744467244765, "grad_norm": 0.3489381968975067, "learning_rate": 2.0979411863094677e-06, "loss": 0.7725, "step": 3500 }, { "epoch": 0.7062761822806835, "grad_norm": 0.38414129614830017, "learning_rate": 2.095280820606121e-06, "loss": 0.8481, "step": 3501 }, { "epoch": 0.7064779178368905, "grad_norm": 0.8350195288658142, "learning_rate": 2.0926216955709355e-06, "loss": 0.8169, "step": 3502 }, { "epoch": 0.7066796533930975, "grad_norm": 0.41042858362197876, "learning_rate": 2.0899638123396847e-06, "loss": 0.8681, "step": 3503 }, { "epoch": 0.7068813889493045, "grad_norm": 0.36468562483787537, "learning_rate": 2.0873071720476067e-06, "loss": 0.6322, "step": 3504 }, { "epoch": 0.7070831245055115, "grad_norm": 0.6368513107299805, "learning_rate": 2.084651775829409e-06, "loss": 0.7808, "step": 3505 }, { "epoch": 0.7072848600617184, "grad_norm": 0.4741699993610382, "learning_rate": 2.0819976248192664e-06, "loss": 0.7226, "step": 3506 }, { "epoch": 0.7074865956179255, "grad_norm": 1.307973027229309, "learning_rate": 2.0793447201508288e-06, "loss": 0.6619, "step": 3507 }, { "epoch": 0.7076883311741324, "grad_norm": 1.281394124031067, "learning_rate": 2.0766930629572057e-06, "loss": 0.7279, "step": 3508 }, { "epoch": 0.7078900667303395, "grad_norm": 0.7751509547233582, "learning_rate": 2.0740426543709783e-06, "loss": 0.6551, "step": 3509 }, { "epoch": 0.7080918022865464, "grad_norm": 0.9136074781417847, "learning_rate": 2.071393495524191e-06, "loss": 0.7842, "step": 3510 }, { "epoch": 0.7082935378427534, "grad_norm": 0.6318010687828064, "learning_rate": 2.0687455875483603e-06, "loss": 0.6892, "step": 3511 }, { "epoch": 0.7084952733989605, "grad_norm": 0.5524263381958008, "learning_rate": 2.0660989315744624e-06, "loss": 0.7104, "step": 3512 }, { "epoch": 0.7086970089551674, "grad_norm": 0.6849998831748962, "learning_rate": 2.0634535287329416e-06, "loss": 0.7497, "step": 3513 }, { "epoch": 0.7088987445113745, "grad_norm": 0.5287047028541565, "learning_rate": 2.060809380153705e-06, "loss": 0.6998, "step": 3514 }, { "epoch": 0.7091004800675814, "grad_norm": 0.3894512355327606, "learning_rate": 2.058166486966128e-06, "loss": 0.7638, "step": 3515 }, { "epoch": 0.7093022156237884, "grad_norm": 0.6588128805160522, "learning_rate": 2.0555248502990473e-06, "loss": 0.6625, "step": 3516 }, { "epoch": 0.7095039511799954, "grad_norm": 1.2160587310791016, "learning_rate": 2.0528844712807588e-06, "loss": 0.6813, "step": 3517 }, { "epoch": 0.7097056867362024, "grad_norm": 0.5312079787254333, "learning_rate": 2.05024535103903e-06, "loss": 0.6315, "step": 3518 }, { "epoch": 0.7099074222924093, "grad_norm": 0.4518246054649353, "learning_rate": 2.0476074907010853e-06, "loss": 0.8286, "step": 3519 }, { "epoch": 0.7101091578486164, "grad_norm": 0.4721282422542572, "learning_rate": 2.044970891393608e-06, "loss": 0.7441, "step": 3520 }, { "epoch": 0.7103108934048233, "grad_norm": 0.43414926528930664, "learning_rate": 2.042335554242752e-06, "loss": 0.6512, "step": 3521 }, { "epoch": 0.7105126289610304, "grad_norm": 0.4045238494873047, "learning_rate": 2.039701480374121e-06, "loss": 0.6694, "step": 3522 }, { "epoch": 0.7107143645172374, "grad_norm": 0.4696193039417267, "learning_rate": 2.0370686709127885e-06, "loss": 0.6588, "step": 3523 }, { "epoch": 0.7109161000734443, "grad_norm": 2.4165115356445312, "learning_rate": 2.0344371269832834e-06, "loss": 0.7298, "step": 3524 }, { "epoch": 0.7111178356296514, "grad_norm": 0.5125167965888977, "learning_rate": 2.031806849709593e-06, "loss": 0.69, "step": 3525 }, { "epoch": 0.7113195711858583, "grad_norm": 0.4065336287021637, "learning_rate": 2.0291778402151685e-06, "loss": 0.663, "step": 3526 }, { "epoch": 0.7115213067420654, "grad_norm": 0.6046711802482605, "learning_rate": 2.026550099622914e-06, "loss": 0.698, "step": 3527 }, { "epoch": 0.7117230422982723, "grad_norm": 0.886115550994873, "learning_rate": 2.0239236290551946e-06, "loss": 0.6353, "step": 3528 }, { "epoch": 0.7119247778544793, "grad_norm": 0.6150587201118469, "learning_rate": 2.021298429633834e-06, "loss": 0.8005, "step": 3529 }, { "epoch": 0.7121265134106863, "grad_norm": 0.8583071827888489, "learning_rate": 2.01867450248011e-06, "loss": 0.7305, "step": 3530 }, { "epoch": 0.7123282489668933, "grad_norm": 1.2208572626113892, "learning_rate": 2.016051848714758e-06, "loss": 0.7093, "step": 3531 }, { "epoch": 0.7125299845231003, "grad_norm": 0.5126375555992126, "learning_rate": 2.0134304694579737e-06, "loss": 0.6569, "step": 3532 }, { "epoch": 0.7127317200793073, "grad_norm": 1.7419109344482422, "learning_rate": 2.0108103658293982e-06, "loss": 1.2269, "step": 3533 }, { "epoch": 0.7129334556355142, "grad_norm": 0.8870477676391602, "learning_rate": 2.008191538948139e-06, "loss": 0.6706, "step": 3534 }, { "epoch": 0.7131351911917213, "grad_norm": 0.6514060497283936, "learning_rate": 2.005573989932753e-06, "loss": 0.7235, "step": 3535 }, { "epoch": 0.7133369267479283, "grad_norm": 0.5482868552207947, "learning_rate": 2.0029577199012496e-06, "loss": 0.7448, "step": 3536 }, { "epoch": 0.7135386623041353, "grad_norm": 0.6119092702865601, "learning_rate": 2.0003427299710966e-06, "loss": 0.6549, "step": 3537 }, { "epoch": 0.7137403978603423, "grad_norm": 0.35968175530433655, "learning_rate": 1.9977290212592116e-06, "loss": 0.672, "step": 3538 }, { "epoch": 0.7139421334165492, "grad_norm": 0.9781152606010437, "learning_rate": 1.9951165948819646e-06, "loss": 0.6671, "step": 3539 }, { "epoch": 0.7141438689727563, "grad_norm": 0.5758945941925049, "learning_rate": 1.9925054519551833e-06, "loss": 0.8566, "step": 3540 }, { "epoch": 0.7143456045289632, "grad_norm": 1.2972782850265503, "learning_rate": 1.989895593594137e-06, "loss": 0.6791, "step": 3541 }, { "epoch": 0.7145473400851702, "grad_norm": 0.7332674264907837, "learning_rate": 1.987287020913556e-06, "loss": 0.7392, "step": 3542 }, { "epoch": 0.7147490756413772, "grad_norm": 0.35885506868362427, "learning_rate": 1.984679735027621e-06, "loss": 0.6427, "step": 3543 }, { "epoch": 0.7149508111975842, "grad_norm": 0.8930257558822632, "learning_rate": 1.9820737370499533e-06, "loss": 0.7992, "step": 3544 }, { "epoch": 0.7151525467537913, "grad_norm": 0.38415658473968506, "learning_rate": 1.979469028093635e-06, "loss": 0.7889, "step": 3545 }, { "epoch": 0.7153542823099982, "grad_norm": 0.41735655069351196, "learning_rate": 1.9768656092711934e-06, "loss": 0.7093, "step": 3546 }, { "epoch": 0.7155560178662052, "grad_norm": 0.3132960796356201, "learning_rate": 1.974263481694602e-06, "loss": 0.6574, "step": 3547 }, { "epoch": 0.7157577534224122, "grad_norm": 0.8277822732925415, "learning_rate": 1.9716626464752896e-06, "loss": 0.6673, "step": 3548 }, { "epoch": 0.7159594889786192, "grad_norm": 0.7590373158454895, "learning_rate": 1.9690631047241267e-06, "loss": 0.6493, "step": 3549 }, { "epoch": 0.7161612245348262, "grad_norm": 0.5453904867172241, "learning_rate": 1.9664648575514316e-06, "loss": 0.6598, "step": 3550 }, { "epoch": 0.7163629600910332, "grad_norm": 0.4179770350456238, "learning_rate": 1.963867906066978e-06, "loss": 0.7093, "step": 3551 }, { "epoch": 0.7165646956472401, "grad_norm": 0.3255173861980438, "learning_rate": 1.9612722513799714e-06, "loss": 0.8126, "step": 3552 }, { "epoch": 0.7167664312034472, "grad_norm": 0.6156246662139893, "learning_rate": 1.9586778945990785e-06, "loss": 0.6545, "step": 3553 }, { "epoch": 0.7169681667596541, "grad_norm": 1.3091801404953003, "learning_rate": 1.9560848368324024e-06, "loss": 0.6836, "step": 3554 }, { "epoch": 0.7171699023158612, "grad_norm": 1.0971754789352417, "learning_rate": 1.953493079187493e-06, "loss": 0.9682, "step": 3555 }, { "epoch": 0.7173716378720681, "grad_norm": 1.8149343729019165, "learning_rate": 1.9509026227713487e-06, "loss": 0.6908, "step": 3556 }, { "epoch": 0.7175733734282751, "grad_norm": 0.5145043134689331, "learning_rate": 1.948313468690407e-06, "loss": 0.6739, "step": 3557 }, { "epoch": 0.7177751089844822, "grad_norm": 0.4650350511074066, "learning_rate": 1.9457256180505507e-06, "loss": 0.645, "step": 3558 }, { "epoch": 0.7179768445406891, "grad_norm": 0.39293235540390015, "learning_rate": 1.9431390719571096e-06, "loss": 0.8465, "step": 3559 }, { "epoch": 0.7181785800968961, "grad_norm": 0.564153254032135, "learning_rate": 1.940553831514852e-06, "loss": 0.6868, "step": 3560 }, { "epoch": 0.7183803156531031, "grad_norm": 0.373946875333786, "learning_rate": 1.9379698978279886e-06, "loss": 0.62, "step": 3561 }, { "epoch": 0.7185820512093101, "grad_norm": 0.7261645197868347, "learning_rate": 1.935387272000175e-06, "loss": 0.6233, "step": 3562 }, { "epoch": 0.7187837867655171, "grad_norm": 0.46915268898010254, "learning_rate": 1.932805955134503e-06, "loss": 0.6634, "step": 3563 }, { "epoch": 0.7189855223217241, "grad_norm": 1.2086641788482666, "learning_rate": 1.9302259483335123e-06, "loss": 0.6569, "step": 3564 }, { "epoch": 0.719187257877931, "grad_norm": 0.520490825176239, "learning_rate": 1.9276472526991785e-06, "loss": 0.6266, "step": 3565 }, { "epoch": 0.7193889934341381, "grad_norm": 0.367524117231369, "learning_rate": 1.925069869332916e-06, "loss": 0.7286, "step": 3566 }, { "epoch": 0.719590728990345, "grad_norm": 0.3706212639808655, "learning_rate": 1.9224937993355846e-06, "loss": 0.7045, "step": 3567 }, { "epoch": 0.7197924645465521, "grad_norm": 0.6001324653625488, "learning_rate": 1.9199190438074767e-06, "loss": 0.6536, "step": 3568 }, { "epoch": 0.719994200102759, "grad_norm": 0.4473857581615448, "learning_rate": 1.9173456038483244e-06, "loss": 0.7551, "step": 3569 }, { "epoch": 0.720195935658966, "grad_norm": 0.5655458569526672, "learning_rate": 1.914773480557304e-06, "loss": 0.6492, "step": 3570 }, { "epoch": 0.7203976712151731, "grad_norm": 0.651198148727417, "learning_rate": 1.9122026750330213e-06, "loss": 0.6725, "step": 3571 }, { "epoch": 0.72059940677138, "grad_norm": 1.1214271783828735, "learning_rate": 1.9096331883735237e-06, "loss": 0.7058, "step": 3572 }, { "epoch": 0.7208011423275871, "grad_norm": 0.4652702510356903, "learning_rate": 1.9070650216762927e-06, "loss": 0.7483, "step": 3573 }, { "epoch": 0.721002877883794, "grad_norm": 0.4463115930557251, "learning_rate": 1.9044981760382502e-06, "loss": 0.6675, "step": 3574 }, { "epoch": 0.721204613440001, "grad_norm": 0.4675081670284271, "learning_rate": 1.9019326525557508e-06, "loss": 0.6827, "step": 3575 }, { "epoch": 0.721406348996208, "grad_norm": 1.2795963287353516, "learning_rate": 1.8993684523245842e-06, "loss": 0.6651, "step": 3576 }, { "epoch": 0.721608084552415, "grad_norm": 0.4441238045692444, "learning_rate": 1.896805576439974e-06, "loss": 0.691, "step": 3577 }, { "epoch": 0.7218098201086219, "grad_norm": 0.4510972797870636, "learning_rate": 1.8942440259965833e-06, "loss": 0.702, "step": 3578 }, { "epoch": 0.722011555664829, "grad_norm": 0.4195414185523987, "learning_rate": 1.891683802088503e-06, "loss": 0.6515, "step": 3579 }, { "epoch": 0.722213291221036, "grad_norm": 0.3845515847206116, "learning_rate": 1.8891249058092609e-06, "loss": 0.6648, "step": 3580 }, { "epoch": 0.722415026777243, "grad_norm": 0.5072640776634216, "learning_rate": 1.8865673382518146e-06, "loss": 0.6789, "step": 3581 }, { "epoch": 0.72261676233345, "grad_norm": 0.8547187447547913, "learning_rate": 1.8840111005085598e-06, "loss": 0.6483, "step": 3582 }, { "epoch": 0.7228184978896569, "grad_norm": 0.5213097929954529, "learning_rate": 1.8814561936713195e-06, "loss": 0.6461, "step": 3583 }, { "epoch": 0.723020233445864, "grad_norm": 3.7029454708099365, "learning_rate": 1.878902618831347e-06, "loss": 0.6881, "step": 3584 }, { "epoch": 0.7232219690020709, "grad_norm": 0.5109856128692627, "learning_rate": 1.8763503770793323e-06, "loss": 0.6796, "step": 3585 }, { "epoch": 0.723423704558278, "grad_norm": 0.512251079082489, "learning_rate": 1.8737994695053924e-06, "loss": 0.6862, "step": 3586 }, { "epoch": 0.7236254401144849, "grad_norm": 0.5038248896598816, "learning_rate": 1.8712498971990723e-06, "loss": 0.6449, "step": 3587 }, { "epoch": 0.7238271756706919, "grad_norm": 0.7192299365997314, "learning_rate": 1.8687016612493542e-06, "loss": 0.6209, "step": 3588 }, { "epoch": 0.7240289112268989, "grad_norm": 0.8483881950378418, "learning_rate": 1.8661547627446386e-06, "loss": 0.6305, "step": 3589 }, { "epoch": 0.7242306467831059, "grad_norm": 0.4165220260620117, "learning_rate": 1.8636092027727653e-06, "loss": 0.6671, "step": 3590 }, { "epoch": 0.724432382339313, "grad_norm": 0.505834698677063, "learning_rate": 1.8610649824209958e-06, "loss": 0.6653, "step": 3591 }, { "epoch": 0.7246341178955199, "grad_norm": 0.44209980964660645, "learning_rate": 1.8585221027760209e-06, "loss": 0.6618, "step": 3592 }, { "epoch": 0.7248358534517269, "grad_norm": 1.2882723808288574, "learning_rate": 1.8559805649239614e-06, "loss": 0.7678, "step": 3593 }, { "epoch": 0.7250375890079339, "grad_norm": 0.33671534061431885, "learning_rate": 1.8534403699503622e-06, "loss": 0.8213, "step": 3594 }, { "epoch": 0.7252393245641409, "grad_norm": 0.6320860981941223, "learning_rate": 1.850901518940193e-06, "loss": 0.6464, "step": 3595 }, { "epoch": 0.7254410601203478, "grad_norm": 0.37541621923446655, "learning_rate": 1.8483640129778575e-06, "loss": 0.6329, "step": 3596 }, { "epoch": 0.7256427956765549, "grad_norm": 0.3355324864387512, "learning_rate": 1.8458278531471712e-06, "loss": 0.7638, "step": 3597 }, { "epoch": 0.7258445312327618, "grad_norm": 0.6320685744285583, "learning_rate": 1.8432930405313871e-06, "loss": 0.7502, "step": 3598 }, { "epoch": 0.7260462667889689, "grad_norm": 0.5309786200523376, "learning_rate": 1.8407595762131814e-06, "loss": 0.6699, "step": 3599 }, { "epoch": 0.7262480023451758, "grad_norm": 0.4337455630302429, "learning_rate": 1.8382274612746447e-06, "loss": 0.654, "step": 3600 }, { "epoch": 0.7264497379013828, "grad_norm": 0.6220811605453491, "learning_rate": 1.8356966967973027e-06, "loss": 0.6629, "step": 3601 }, { "epoch": 0.7266514734575898, "grad_norm": 0.6550498008728027, "learning_rate": 1.833167283862098e-06, "loss": 0.6817, "step": 3602 }, { "epoch": 0.7268532090137968, "grad_norm": 0.3714157044887543, "learning_rate": 1.8306392235493946e-06, "loss": 0.8233, "step": 3603 }, { "epoch": 0.7270549445700039, "grad_norm": 0.41854748129844666, "learning_rate": 1.8281125169389868e-06, "loss": 0.6654, "step": 3604 }, { "epoch": 0.7272566801262108, "grad_norm": 0.7979785203933716, "learning_rate": 1.825587165110082e-06, "loss": 0.6649, "step": 3605 }, { "epoch": 0.7274584156824178, "grad_norm": 0.6475619077682495, "learning_rate": 1.823063169141312e-06, "loss": 0.8118, "step": 3606 }, { "epoch": 0.7276601512386248, "grad_norm": 0.7294145226478577, "learning_rate": 1.8205405301107343e-06, "loss": 0.6871, "step": 3607 }, { "epoch": 0.7278618867948318, "grad_norm": 0.3854321539402008, "learning_rate": 1.818019249095816e-06, "loss": 0.6471, "step": 3608 }, { "epoch": 0.7280636223510388, "grad_norm": 0.8849390745162964, "learning_rate": 1.815499327173455e-06, "loss": 0.8196, "step": 3609 }, { "epoch": 0.7282653579072458, "grad_norm": 1.1094517707824707, "learning_rate": 1.8129807654199628e-06, "loss": 0.6813, "step": 3610 }, { "epoch": 0.7284670934634527, "grad_norm": 0.33865752816200256, "learning_rate": 1.8104635649110702e-06, "loss": 0.766, "step": 3611 }, { "epoch": 0.7286688290196598, "grad_norm": 0.3571256995201111, "learning_rate": 1.8079477267219308e-06, "loss": 0.6486, "step": 3612 }, { "epoch": 0.7288705645758667, "grad_norm": 0.7489669322967529, "learning_rate": 1.8054332519271118e-06, "loss": 0.6319, "step": 3613 }, { "epoch": 0.7290723001320737, "grad_norm": 1.4024207592010498, "learning_rate": 1.8029201416005976e-06, "loss": 0.696, "step": 3614 }, { "epoch": 0.7292740356882808, "grad_norm": 0.4047021269798279, "learning_rate": 1.8004083968157953e-06, "loss": 0.896, "step": 3615 }, { "epoch": 0.7294757712444877, "grad_norm": 0.4339413642883301, "learning_rate": 1.7978980186455236e-06, "loss": 0.7205, "step": 3616 }, { "epoch": 0.7296775068006948, "grad_norm": 0.3869302570819855, "learning_rate": 1.7953890081620174e-06, "loss": 0.7952, "step": 3617 }, { "epoch": 0.7298792423569017, "grad_norm": 0.6974090933799744, "learning_rate": 1.7928813664369339e-06, "loss": 0.6494, "step": 3618 }, { "epoch": 0.7300809779131087, "grad_norm": 0.6910544037818909, "learning_rate": 1.790375094541335e-06, "loss": 0.6995, "step": 3619 }, { "epoch": 0.7302827134693157, "grad_norm": 0.5370467901229858, "learning_rate": 1.7878701935457076e-06, "loss": 0.7044, "step": 3620 }, { "epoch": 0.7304844490255227, "grad_norm": 0.5953854322433472, "learning_rate": 1.7853666645199474e-06, "loss": 0.8002, "step": 3621 }, { "epoch": 0.7306861845817297, "grad_norm": 0.4186118543148041, "learning_rate": 1.7828645085333645e-06, "loss": 0.7228, "step": 3622 }, { "epoch": 0.7308879201379367, "grad_norm": 0.38932132720947266, "learning_rate": 1.7803637266546864e-06, "loss": 0.7806, "step": 3623 }, { "epoch": 0.7310896556941436, "grad_norm": 0.6305902600288391, "learning_rate": 1.7778643199520496e-06, "loss": 0.6624, "step": 3624 }, { "epoch": 0.7312913912503507, "grad_norm": 0.37308868765830994, "learning_rate": 1.775366289493003e-06, "loss": 0.6912, "step": 3625 }, { "epoch": 0.7314931268065576, "grad_norm": 0.5473910570144653, "learning_rate": 1.772869636344512e-06, "loss": 0.6916, "step": 3626 }, { "epoch": 0.7316948623627647, "grad_norm": 0.8070292472839355, "learning_rate": 1.7703743615729501e-06, "loss": 0.7116, "step": 3627 }, { "epoch": 0.7318965979189717, "grad_norm": 0.7206965684890747, "learning_rate": 1.7678804662441019e-06, "loss": 0.6904, "step": 3628 }, { "epoch": 0.7320983334751786, "grad_norm": 0.31374138593673706, "learning_rate": 1.7653879514231631e-06, "loss": 0.7582, "step": 3629 }, { "epoch": 0.7323000690313857, "grad_norm": 0.5226505994796753, "learning_rate": 1.7628968181747435e-06, "loss": 0.656, "step": 3630 }, { "epoch": 0.7325018045875926, "grad_norm": 0.6887264847755432, "learning_rate": 1.760407067562858e-06, "loss": 0.8081, "step": 3631 }, { "epoch": 0.7327035401437997, "grad_norm": 0.4167690575122833, "learning_rate": 1.757918700650933e-06, "loss": 1.3488, "step": 3632 }, { "epoch": 0.7329052757000066, "grad_norm": 0.3777919113636017, "learning_rate": 1.7554317185018016e-06, "loss": 0.737, "step": 3633 }, { "epoch": 0.7331070112562136, "grad_norm": 0.3365021347999573, "learning_rate": 1.7529461221777117e-06, "loss": 0.6321, "step": 3634 }, { "epoch": 0.7333087468124206, "grad_norm": 0.5992380380630493, "learning_rate": 1.7504619127403122e-06, "loss": 0.7048, "step": 3635 }, { "epoch": 0.7335104823686276, "grad_norm": 0.5630524754524231, "learning_rate": 1.7479790912506628e-06, "loss": 0.6618, "step": 3636 }, { "epoch": 0.7337122179248345, "grad_norm": 0.8216602206230164, "learning_rate": 1.745497658769229e-06, "loss": 0.6668, "step": 3637 }, { "epoch": 0.7339139534810416, "grad_norm": 0.6757683157920837, "learning_rate": 1.743017616355887e-06, "loss": 0.6826, "step": 3638 }, { "epoch": 0.7341156890372486, "grad_norm": 0.45060646533966064, "learning_rate": 1.740538965069915e-06, "loss": 0.7319, "step": 3639 }, { "epoch": 0.7343174245934556, "grad_norm": 0.4027143716812134, "learning_rate": 1.7380617059699961e-06, "loss": 0.816, "step": 3640 }, { "epoch": 0.7345191601496626, "grad_norm": 0.6248337626457214, "learning_rate": 1.735585840114225e-06, "loss": 0.6808, "step": 3641 }, { "epoch": 0.7347208957058695, "grad_norm": 0.3696412146091461, "learning_rate": 1.7331113685600954e-06, "loss": 0.6959, "step": 3642 }, { "epoch": 0.7349226312620766, "grad_norm": 0.6471091508865356, "learning_rate": 1.7306382923645054e-06, "loss": 0.6513, "step": 3643 }, { "epoch": 0.7351243668182835, "grad_norm": 0.31260403990745544, "learning_rate": 1.7281666125837637e-06, "loss": 0.6608, "step": 3644 }, { "epoch": 0.7353261023744906, "grad_norm": 0.45326605439186096, "learning_rate": 1.7256963302735752e-06, "loss": 0.7701, "step": 3645 }, { "epoch": 0.7355278379306975, "grad_norm": 0.39521533250808716, "learning_rate": 1.7232274464890509e-06, "loss": 0.6616, "step": 3646 }, { "epoch": 0.7357295734869045, "grad_norm": 0.5993699431419373, "learning_rate": 1.7207599622847042e-06, "loss": 0.7022, "step": 3647 }, { "epoch": 0.7359313090431115, "grad_norm": 2.394320487976074, "learning_rate": 1.7182938787144498e-06, "loss": 0.6558, "step": 3648 }, { "epoch": 0.7361330445993185, "grad_norm": 0.6650750041007996, "learning_rate": 1.7158291968316076e-06, "loss": 0.8178, "step": 3649 }, { "epoch": 0.7363347801555256, "grad_norm": 0.7132764458656311, "learning_rate": 1.7133659176888956e-06, "loss": 0.6896, "step": 3650 }, { "epoch": 0.7365365157117325, "grad_norm": 0.5961502194404602, "learning_rate": 1.710904042338431e-06, "loss": 0.6361, "step": 3651 }, { "epoch": 0.7367382512679395, "grad_norm": 0.3709481358528137, "learning_rate": 1.7084435718317372e-06, "loss": 0.7271, "step": 3652 }, { "epoch": 0.7369399868241465, "grad_norm": 1.152391791343689, "learning_rate": 1.705984507219733e-06, "loss": 0.6716, "step": 3653 }, { "epoch": 0.7371417223803535, "grad_norm": 0.5584093332290649, "learning_rate": 1.7035268495527358e-06, "loss": 0.7522, "step": 3654 }, { "epoch": 0.7373434579365604, "grad_norm": 0.4153057932853699, "learning_rate": 1.7010705998804694e-06, "loss": 0.6646, "step": 3655 }, { "epoch": 0.7375451934927675, "grad_norm": 0.29575854539871216, "learning_rate": 1.6986157592520442e-06, "loss": 0.7195, "step": 3656 }, { "epoch": 0.7377469290489744, "grad_norm": 0.6873073577880859, "learning_rate": 1.6961623287159784e-06, "loss": 0.6508, "step": 3657 }, { "epoch": 0.7379486646051815, "grad_norm": 0.5382766127586365, "learning_rate": 1.6937103093201895e-06, "loss": 0.7678, "step": 3658 }, { "epoch": 0.7381504001613884, "grad_norm": 0.8277750015258789, "learning_rate": 1.6912597021119802e-06, "loss": 0.7059, "step": 3659 }, { "epoch": 0.7383521357175954, "grad_norm": 1.0506656169891357, "learning_rate": 1.6888105081380628e-06, "loss": 0.6723, "step": 3660 }, { "epoch": 0.7385538712738025, "grad_norm": 0.4117637276649475, "learning_rate": 1.68636272844454e-06, "loss": 0.6578, "step": 3661 }, { "epoch": 0.7387556068300094, "grad_norm": 0.8198413252830505, "learning_rate": 1.6839163640769084e-06, "loss": 0.6617, "step": 3662 }, { "epoch": 0.7389573423862165, "grad_norm": 0.3305530250072479, "learning_rate": 1.6814714160800683e-06, "loss": 1.0195, "step": 3663 }, { "epoch": 0.7391590779424234, "grad_norm": 0.41136759519577026, "learning_rate": 1.6790278854983033e-06, "loss": 0.6396, "step": 3664 }, { "epoch": 0.7393608134986304, "grad_norm": 0.9290247559547424, "learning_rate": 1.6765857733753016e-06, "loss": 0.7637, "step": 3665 }, { "epoch": 0.7395625490548374, "grad_norm": 0.8298892378807068, "learning_rate": 1.6741450807541448e-06, "loss": 0.7454, "step": 3666 }, { "epoch": 0.7397642846110444, "grad_norm": 0.7271751165390015, "learning_rate": 1.671705808677298e-06, "loss": 0.6321, "step": 3667 }, { "epoch": 0.7399660201672514, "grad_norm": 0.4080258011817932, "learning_rate": 1.6692679581866334e-06, "loss": 0.6645, "step": 3668 }, { "epoch": 0.7401677557234584, "grad_norm": 0.4282958507537842, "learning_rate": 1.6668315303234068e-06, "loss": 0.7104, "step": 3669 }, { "epoch": 0.7403694912796653, "grad_norm": 0.7578421235084534, "learning_rate": 1.6643965261282675e-06, "loss": 0.6384, "step": 3670 }, { "epoch": 0.7405712268358724, "grad_norm": 1.0067808628082275, "learning_rate": 1.6619629466412613e-06, "loss": 0.8314, "step": 3671 }, { "epoch": 0.7407729623920793, "grad_norm": 0.4014785885810852, "learning_rate": 1.6595307929018216e-06, "loss": 0.6424, "step": 3672 }, { "epoch": 0.7409746979482863, "grad_norm": 0.5424908995628357, "learning_rate": 1.6571000659487719e-06, "loss": 0.8547, "step": 3673 }, { "epoch": 0.7411764335044934, "grad_norm": 0.32863613963127136, "learning_rate": 1.6546707668203322e-06, "loss": 0.6912, "step": 3674 }, { "epoch": 0.7413781690607003, "grad_norm": 0.7613126635551453, "learning_rate": 1.652242896554102e-06, "loss": 0.7847, "step": 3675 }, { "epoch": 0.7415799046169074, "grad_norm": 0.38640883564949036, "learning_rate": 1.6498164561870834e-06, "loss": 0.7502, "step": 3676 }, { "epoch": 0.7417816401731143, "grad_norm": 0.5415723323822021, "learning_rate": 1.6473914467556578e-06, "loss": 0.6831, "step": 3677 }, { "epoch": 0.7419833757293213, "grad_norm": 0.5771734714508057, "learning_rate": 1.644967869295599e-06, "loss": 0.6915, "step": 3678 }, { "epoch": 0.7421851112855283, "grad_norm": 0.30845147371292114, "learning_rate": 1.6425457248420712e-06, "loss": 0.6984, "step": 3679 }, { "epoch": 0.7423868468417353, "grad_norm": 0.2992367148399353, "learning_rate": 1.6401250144296239e-06, "loss": 0.7463, "step": 3680 }, { "epoch": 0.7425885823979423, "grad_norm": 0.38862472772598267, "learning_rate": 1.6377057390921919e-06, "loss": 0.691, "step": 3681 }, { "epoch": 0.7427903179541493, "grad_norm": 0.6588892340660095, "learning_rate": 1.6352878998631044e-06, "loss": 0.6742, "step": 3682 }, { "epoch": 0.7429920535103562, "grad_norm": 0.35232171416282654, "learning_rate": 1.6328714977750698e-06, "loss": 0.6368, "step": 3683 }, { "epoch": 0.7431937890665633, "grad_norm": 0.7427819967269897, "learning_rate": 1.6304565338601864e-06, "loss": 0.7213, "step": 3684 }, { "epoch": 0.7433955246227703, "grad_norm": 0.35415828227996826, "learning_rate": 1.628043009149935e-06, "loss": 0.791, "step": 3685 }, { "epoch": 0.7435972601789773, "grad_norm": 0.4176258444786072, "learning_rate": 1.6256309246751879e-06, "loss": 0.6655, "step": 3686 }, { "epoch": 0.7437989957351843, "grad_norm": 0.45703744888305664, "learning_rate": 1.6232202814661963e-06, "loss": 0.7076, "step": 3687 }, { "epoch": 0.7440007312913912, "grad_norm": 0.5029721260070801, "learning_rate": 1.6208110805525983e-06, "loss": 0.8178, "step": 3688 }, { "epoch": 0.7442024668475983, "grad_norm": 0.647709310054779, "learning_rate": 1.6184033229634134e-06, "loss": 0.6565, "step": 3689 }, { "epoch": 0.7444042024038052, "grad_norm": 0.6276201009750366, "learning_rate": 1.61599700972705e-06, "loss": 0.6759, "step": 3690 }, { "epoch": 0.7446059379600122, "grad_norm": 0.6077056527137756, "learning_rate": 1.6135921418712959e-06, "loss": 0.666, "step": 3691 }, { "epoch": 0.7448076735162192, "grad_norm": 0.5409219264984131, "learning_rate": 1.6111887204233184e-06, "loss": 0.7746, "step": 3692 }, { "epoch": 0.7450094090724262, "grad_norm": 0.38843217492103577, "learning_rate": 1.608786746409675e-06, "loss": 0.7357, "step": 3693 }, { "epoch": 0.7452111446286332, "grad_norm": 0.5379346013069153, "learning_rate": 1.606386220856299e-06, "loss": 0.6834, "step": 3694 }, { "epoch": 0.7454128801848402, "grad_norm": 0.7650083899497986, "learning_rate": 1.603987144788507e-06, "loss": 0.8181, "step": 3695 }, { "epoch": 0.7456146157410471, "grad_norm": 0.48006683588027954, "learning_rate": 1.6015895192309933e-06, "loss": 0.7082, "step": 3696 }, { "epoch": 0.7458163512972542, "grad_norm": 0.42755845189094543, "learning_rate": 1.5991933452078396e-06, "loss": 0.6766, "step": 3697 }, { "epoch": 0.7460180868534612, "grad_norm": 0.4909815192222595, "learning_rate": 1.596798623742501e-06, "loss": 0.7053, "step": 3698 }, { "epoch": 0.7462198224096682, "grad_norm": 0.4889410138130188, "learning_rate": 1.5944053558578144e-06, "loss": 0.6501, "step": 3699 }, { "epoch": 0.7464215579658752, "grad_norm": 0.6778377890586853, "learning_rate": 1.5920135425759974e-06, "loss": 0.7152, "step": 3700 }, { "epoch": 0.7466232935220821, "grad_norm": 2.5792107582092285, "learning_rate": 1.5896231849186456e-06, "loss": 0.7679, "step": 3701 }, { "epoch": 0.7468250290782892, "grad_norm": 0.7936316132545471, "learning_rate": 1.5872342839067305e-06, "loss": 0.6517, "step": 3702 }, { "epoch": 0.7470267646344961, "grad_norm": 0.5701306462287903, "learning_rate": 1.5848468405606038e-06, "loss": 0.7991, "step": 3703 }, { "epoch": 0.7472285001907032, "grad_norm": 0.4055497646331787, "learning_rate": 1.5824608558999927e-06, "loss": 0.6636, "step": 3704 }, { "epoch": 0.7474302357469101, "grad_norm": 0.34075456857681274, "learning_rate": 1.5800763309440053e-06, "loss": 0.8077, "step": 3705 }, { "epoch": 0.7476319713031171, "grad_norm": 0.7685133218765259, "learning_rate": 1.5776932667111228e-06, "loss": 0.6494, "step": 3706 }, { "epoch": 0.7478337068593242, "grad_norm": 0.39524295926094055, "learning_rate": 1.5753116642192013e-06, "loss": 1.0224, "step": 3707 }, { "epoch": 0.7480354424155311, "grad_norm": 0.4503965377807617, "learning_rate": 1.572931524485477e-06, "loss": 0.6458, "step": 3708 }, { "epoch": 0.748237177971738, "grad_norm": 0.48717793822288513, "learning_rate": 1.5705528485265586e-06, "loss": 0.6641, "step": 3709 }, { "epoch": 0.7484389135279451, "grad_norm": 0.3593592941761017, "learning_rate": 1.5681756373584272e-06, "loss": 0.9094, "step": 3710 }, { "epoch": 0.7486406490841521, "grad_norm": 0.5503544211387634, "learning_rate": 1.5657998919964462e-06, "loss": 0.6513, "step": 3711 }, { "epoch": 0.7488423846403591, "grad_norm": 0.45195677876472473, "learning_rate": 1.5634256134553416e-06, "loss": 0.6276, "step": 3712 }, { "epoch": 0.7490441201965661, "grad_norm": 0.4969177842140198, "learning_rate": 1.561052802749221e-06, "loss": 0.7005, "step": 3713 }, { "epoch": 0.749245855752773, "grad_norm": 0.6967068314552307, "learning_rate": 1.5586814608915673e-06, "loss": 0.6904, "step": 3714 }, { "epoch": 0.7494475913089801, "grad_norm": 0.42649900913238525, "learning_rate": 1.5563115888952252e-06, "loss": 0.8253, "step": 3715 }, { "epoch": 0.749649326865187, "grad_norm": 0.5606639981269836, "learning_rate": 1.553943187772422e-06, "loss": 0.8229, "step": 3716 }, { "epoch": 0.7498510624213941, "grad_norm": 0.4556725025177002, "learning_rate": 1.5515762585347526e-06, "loss": 0.6659, "step": 3717 }, { "epoch": 0.750052797977601, "grad_norm": 0.3770776391029358, "learning_rate": 1.5492108021931806e-06, "loss": 0.7185, "step": 3718 }, { "epoch": 0.750254533533808, "grad_norm": 1.0561859607696533, "learning_rate": 1.5468468197580478e-06, "loss": 0.7136, "step": 3719 }, { "epoch": 0.7504562690900151, "grad_norm": 0.6821918487548828, "learning_rate": 1.544484312239059e-06, "loss": 0.6605, "step": 3720 }, { "epoch": 0.750658004646222, "grad_norm": 0.5854676365852356, "learning_rate": 1.542123280645292e-06, "loss": 0.6434, "step": 3721 }, { "epoch": 0.7508597402024291, "grad_norm": 0.6247336864471436, "learning_rate": 1.5397637259851977e-06, "loss": 0.6861, "step": 3722 }, { "epoch": 0.751061475758636, "grad_norm": 0.3600952625274658, "learning_rate": 1.5374056492665879e-06, "loss": 0.7089, "step": 3723 }, { "epoch": 0.751263211314843, "grad_norm": 0.34228378534317017, "learning_rate": 1.5350490514966509e-06, "loss": 0.7864, "step": 3724 }, { "epoch": 0.75146494687105, "grad_norm": 0.7408826947212219, "learning_rate": 1.5326939336819408e-06, "loss": 0.6609, "step": 3725 }, { "epoch": 0.751666682427257, "grad_norm": 0.31936943531036377, "learning_rate": 1.5303402968283758e-06, "loss": 0.6578, "step": 3726 }, { "epoch": 0.751868417983464, "grad_norm": 0.4063432216644287, "learning_rate": 1.527988141941249e-06, "loss": 0.6571, "step": 3727 }, { "epoch": 0.752070153539671, "grad_norm": 0.3507099449634552, "learning_rate": 1.5256374700252151e-06, "loss": 0.6911, "step": 3728 }, { "epoch": 0.7522718890958779, "grad_norm": 0.4853664040565491, "learning_rate": 1.5232882820842948e-06, "loss": 0.6991, "step": 3729 }, { "epoch": 0.752473624652085, "grad_norm": 0.3933629095554352, "learning_rate": 1.520940579121881e-06, "loss": 0.6584, "step": 3730 }, { "epoch": 0.752675360208292, "grad_norm": 0.37084707617759705, "learning_rate": 1.5185943621407233e-06, "loss": 0.655, "step": 3731 }, { "epoch": 0.7528770957644989, "grad_norm": 0.3501735329627991, "learning_rate": 1.5162496321429438e-06, "loss": 0.6804, "step": 3732 }, { "epoch": 0.753078831320706, "grad_norm": 0.5413325428962708, "learning_rate": 1.5139063901300298e-06, "loss": 0.6807, "step": 3733 }, { "epoch": 0.7532805668769129, "grad_norm": 0.3382270038127899, "learning_rate": 1.5115646371028258e-06, "loss": 0.6519, "step": 3734 }, { "epoch": 0.75348230243312, "grad_norm": 0.37832415103912354, "learning_rate": 1.5092243740615486e-06, "loss": 0.6879, "step": 3735 }, { "epoch": 0.7536840379893269, "grad_norm": 0.4899078905582428, "learning_rate": 1.5068856020057732e-06, "loss": 0.6914, "step": 3736 }, { "epoch": 0.7538857735455339, "grad_norm": 0.786450207233429, "learning_rate": 1.5045483219344387e-06, "loss": 0.6993, "step": 3737 }, { "epoch": 0.7540875091017409, "grad_norm": 0.6078961491584778, "learning_rate": 1.5022125348458504e-06, "loss": 0.6498, "step": 3738 }, { "epoch": 0.7542892446579479, "grad_norm": 0.39420047402381897, "learning_rate": 1.4998782417376723e-06, "loss": 0.6692, "step": 3739 }, { "epoch": 0.754490980214155, "grad_norm": 0.7297521233558655, "learning_rate": 1.4975454436069292e-06, "loss": 0.6502, "step": 3740 }, { "epoch": 0.7546927157703619, "grad_norm": 0.4037453830242157, "learning_rate": 1.4952141414500143e-06, "loss": 0.6704, "step": 3741 }, { "epoch": 0.7548944513265688, "grad_norm": 0.5245139002799988, "learning_rate": 1.4928843362626705e-06, "loss": 0.6671, "step": 3742 }, { "epoch": 0.7550961868827759, "grad_norm": 0.5826756954193115, "learning_rate": 1.4905560290400128e-06, "loss": 0.642, "step": 3743 }, { "epoch": 0.7552979224389829, "grad_norm": 0.3041967749595642, "learning_rate": 1.4882292207765104e-06, "loss": 0.7224, "step": 3744 }, { "epoch": 0.7554996579951899, "grad_norm": 0.31114375591278076, "learning_rate": 1.4859039124659908e-06, "loss": 0.7237, "step": 3745 }, { "epoch": 0.7557013935513969, "grad_norm": 0.8395849466323853, "learning_rate": 1.4835801051016463e-06, "loss": 0.6635, "step": 3746 }, { "epoch": 0.7559031291076038, "grad_norm": 0.3680986166000366, "learning_rate": 1.4812577996760242e-06, "loss": 0.8131, "step": 3747 }, { "epoch": 0.7561048646638109, "grad_norm": 1.124172329902649, "learning_rate": 1.4789369971810298e-06, "loss": 0.6186, "step": 3748 }, { "epoch": 0.7563066002200178, "grad_norm": 0.3392429053783417, "learning_rate": 1.47661769860793e-06, "loss": 0.6952, "step": 3749 }, { "epoch": 0.7565083357762248, "grad_norm": 0.37163037061691284, "learning_rate": 1.474299904947346e-06, "loss": 0.7324, "step": 3750 }, { "epoch": 0.7567100713324318, "grad_norm": 0.3731152415275574, "learning_rate": 1.471983617189258e-06, "loss": 0.6832, "step": 3751 }, { "epoch": 0.7569118068886388, "grad_norm": 1.5635799169540405, "learning_rate": 1.469668836323001e-06, "loss": 0.6204, "step": 3752 }, { "epoch": 0.7571135424448459, "grad_norm": 0.32818305492401123, "learning_rate": 1.4673555633372699e-06, "loss": 0.6593, "step": 3753 }, { "epoch": 0.7573152780010528, "grad_norm": 0.5359674692153931, "learning_rate": 1.4650437992201122e-06, "loss": 0.6488, "step": 3754 }, { "epoch": 0.7575170135572598, "grad_norm": 0.39580339193344116, "learning_rate": 1.4627335449589331e-06, "loss": 0.665, "step": 3755 }, { "epoch": 0.7577187491134668, "grad_norm": 0.3998722732067108, "learning_rate": 1.4604248015404886e-06, "loss": 0.6748, "step": 3756 }, { "epoch": 0.7579204846696738, "grad_norm": 0.3832513093948364, "learning_rate": 1.4581175699508982e-06, "loss": 0.6579, "step": 3757 }, { "epoch": 0.7581222202258808, "grad_norm": 0.7050317525863647, "learning_rate": 1.455811851175627e-06, "loss": 0.6625, "step": 3758 }, { "epoch": 0.7583239557820878, "grad_norm": 0.37937480211257935, "learning_rate": 1.4535076461994974e-06, "loss": 0.6636, "step": 3759 }, { "epoch": 0.7585256913382947, "grad_norm": 0.7320890426635742, "learning_rate": 1.4512049560066837e-06, "loss": 0.652, "step": 3760 }, { "epoch": 0.7587274268945018, "grad_norm": 0.39705395698547363, "learning_rate": 1.4489037815807178e-06, "loss": 0.6796, "step": 3761 }, { "epoch": 0.7589291624507087, "grad_norm": 0.48778223991394043, "learning_rate": 1.4466041239044792e-06, "loss": 0.7062, "step": 3762 }, { "epoch": 0.7591308980069158, "grad_norm": 0.40511614084243774, "learning_rate": 1.4443059839601998e-06, "loss": 0.6851, "step": 3763 }, { "epoch": 0.7593326335631227, "grad_norm": 0.40412962436676025, "learning_rate": 1.4420093627294673e-06, "loss": 0.824, "step": 3764 }, { "epoch": 0.7595343691193297, "grad_norm": 0.6214047074317932, "learning_rate": 1.4397142611932174e-06, "loss": 0.6574, "step": 3765 }, { "epoch": 0.7597361046755368, "grad_norm": 0.7142421007156372, "learning_rate": 1.4374206803317354e-06, "loss": 0.6656, "step": 3766 }, { "epoch": 0.7599378402317437, "grad_norm": 0.49294382333755493, "learning_rate": 1.4351286211246618e-06, "loss": 0.7274, "step": 3767 }, { "epoch": 0.7601395757879507, "grad_norm": 0.7711848616600037, "learning_rate": 1.4328380845509837e-06, "loss": 0.6402, "step": 3768 }, { "epoch": 0.7603413113441577, "grad_norm": 3.8337807655334473, "learning_rate": 1.430549071589038e-06, "loss": 0.6335, "step": 3769 }, { "epoch": 0.7605430469003647, "grad_norm": 0.3326573669910431, "learning_rate": 1.428261583216512e-06, "loss": 0.7242, "step": 3770 }, { "epoch": 0.7607447824565717, "grad_norm": 0.4140056073665619, "learning_rate": 1.4259756204104396e-06, "loss": 0.6446, "step": 3771 }, { "epoch": 0.7609465180127787, "grad_norm": 0.42758747935295105, "learning_rate": 1.4236911841472074e-06, "loss": 0.6707, "step": 3772 }, { "epoch": 0.7611482535689856, "grad_norm": 0.5065056085586548, "learning_rate": 1.4214082754025466e-06, "loss": 0.6566, "step": 3773 }, { "epoch": 0.7613499891251927, "grad_norm": 0.48952701687812805, "learning_rate": 1.4191268951515348e-06, "loss": 0.6657, "step": 3774 }, { "epoch": 0.7615517246813996, "grad_norm": 1.1300623416900635, "learning_rate": 1.4168470443686017e-06, "loss": 0.6557, "step": 3775 }, { "epoch": 0.7617534602376067, "grad_norm": 0.532576322555542, "learning_rate": 1.414568724027519e-06, "loss": 0.6368, "step": 3776 }, { "epoch": 0.7619551957938137, "grad_norm": 0.5727057456970215, "learning_rate": 1.4122919351014052e-06, "loss": 0.6731, "step": 3777 }, { "epoch": 0.7621569313500206, "grad_norm": 1.5019340515136719, "learning_rate": 1.4100166785627301e-06, "loss": 0.6507, "step": 3778 }, { "epoch": 0.7623586669062277, "grad_norm": 0.39014142751693726, "learning_rate": 1.4077429553832995e-06, "loss": 0.7623, "step": 3779 }, { "epoch": 0.7625604024624346, "grad_norm": 0.36659300327301025, "learning_rate": 1.4054707665342721e-06, "loss": 0.733, "step": 3780 }, { "epoch": 0.7627621380186417, "grad_norm": 0.48996803164482117, "learning_rate": 1.403200112986151e-06, "loss": 0.6413, "step": 3781 }, { "epoch": 0.7629638735748486, "grad_norm": 0.33237341046333313, "learning_rate": 1.400930995708777e-06, "loss": 0.6667, "step": 3782 }, { "epoch": 0.7631656091310556, "grad_norm": 0.3961053192615509, "learning_rate": 1.3986634156713418e-06, "loss": 0.6613, "step": 3783 }, { "epoch": 0.7633673446872626, "grad_norm": 0.44538700580596924, "learning_rate": 1.3963973738423774e-06, "loss": 0.8809, "step": 3784 }, { "epoch": 0.7635690802434696, "grad_norm": 0.5624479055404663, "learning_rate": 1.3941328711897568e-06, "loss": 0.6743, "step": 3785 }, { "epoch": 0.7637708157996765, "grad_norm": 0.349399596452713, "learning_rate": 1.391869908680703e-06, "loss": 0.6448, "step": 3786 }, { "epoch": 0.7639725513558836, "grad_norm": 0.3919467329978943, "learning_rate": 1.3896084872817695e-06, "loss": 0.6094, "step": 3787 }, { "epoch": 0.7641742869120906, "grad_norm": 0.45314642786979675, "learning_rate": 1.3873486079588617e-06, "loss": 0.8347, "step": 3788 }, { "epoch": 0.7643760224682976, "grad_norm": 0.7018961906433105, "learning_rate": 1.3850902716772251e-06, "loss": 0.7033, "step": 3789 }, { "epoch": 0.7645777580245046, "grad_norm": 0.3727651536464691, "learning_rate": 1.382833479401438e-06, "loss": 0.6793, "step": 3790 }, { "epoch": 0.7647794935807115, "grad_norm": 0.34330493211746216, "learning_rate": 1.3805782320954297e-06, "loss": 0.8555, "step": 3791 }, { "epoch": 0.7649812291369186, "grad_norm": 0.492082417011261, "learning_rate": 1.3783245307224635e-06, "loss": 0.7536, "step": 3792 }, { "epoch": 0.7651829646931255, "grad_norm": 0.3740497827529907, "learning_rate": 1.3760723762451428e-06, "loss": 0.8049, "step": 3793 }, { "epoch": 0.7653847002493326, "grad_norm": 0.3092051148414612, "learning_rate": 1.373821769625413e-06, "loss": 0.7887, "step": 3794 }, { "epoch": 0.7655864358055395, "grad_norm": 0.3884614109992981, "learning_rate": 1.3715727118245558e-06, "loss": 0.6814, "step": 3795 }, { "epoch": 0.7657881713617465, "grad_norm": 1.359918236732483, "learning_rate": 1.3693252038031912e-06, "loss": 0.6626, "step": 3796 }, { "epoch": 0.7659899069179535, "grad_norm": 0.4398966133594513, "learning_rate": 1.3670792465212828e-06, "loss": 0.6604, "step": 3797 }, { "epoch": 0.7661916424741605, "grad_norm": 0.7652633190155029, "learning_rate": 1.3648348409381208e-06, "loss": 0.675, "step": 3798 }, { "epoch": 0.7663933780303676, "grad_norm": 0.8095738291740417, "learning_rate": 1.3625919880123438e-06, "loss": 0.6632, "step": 3799 }, { "epoch": 0.7665951135865745, "grad_norm": 0.5652156472206116, "learning_rate": 1.3603506887019214e-06, "loss": 0.7734, "step": 3800 }, { "epoch": 0.7667968491427815, "grad_norm": 0.45031893253326416, "learning_rate": 1.3581109439641587e-06, "loss": 0.6172, "step": 3801 }, { "epoch": 0.7669985846989885, "grad_norm": 0.49738746881484985, "learning_rate": 1.3558727547557032e-06, "loss": 0.7687, "step": 3802 }, { "epoch": 0.7672003202551955, "grad_norm": 3.2418699264526367, "learning_rate": 1.3536361220325312e-06, "loss": 0.6724, "step": 3803 }, { "epoch": 0.7674020558114024, "grad_norm": 0.34520092606544495, "learning_rate": 1.3514010467499556e-06, "loss": 0.6646, "step": 3804 }, { "epoch": 0.7676037913676095, "grad_norm": 0.5189719796180725, "learning_rate": 1.3491675298626279e-06, "loss": 0.8133, "step": 3805 }, { "epoch": 0.7678055269238164, "grad_norm": 0.7521995902061462, "learning_rate": 1.3469355723245303e-06, "loss": 0.6499, "step": 3806 }, { "epoch": 0.7680072624800235, "grad_norm": 0.6148568987846375, "learning_rate": 1.3447051750889783e-06, "loss": 1.0773, "step": 3807 }, { "epoch": 0.7682089980362304, "grad_norm": 2.7657034397125244, "learning_rate": 1.3424763391086253e-06, "loss": 0.6823, "step": 3808 }, { "epoch": 0.7684107335924374, "grad_norm": 0.35511335730552673, "learning_rate": 1.3402490653354544e-06, "loss": 0.7114, "step": 3809 }, { "epoch": 0.7686124691486445, "grad_norm": 2.8116512298583984, "learning_rate": 1.338023354720781e-06, "loss": 0.8214, "step": 3810 }, { "epoch": 0.7688142047048514, "grad_norm": 0.5073435306549072, "learning_rate": 1.3357992082152555e-06, "loss": 0.7818, "step": 3811 }, { "epoch": 0.7690159402610585, "grad_norm": 0.4251355230808258, "learning_rate": 1.3335766267688566e-06, "loss": 0.6838, "step": 3812 }, { "epoch": 0.7692176758172654, "grad_norm": 0.3123939633369446, "learning_rate": 1.3313556113308994e-06, "loss": 0.6698, "step": 3813 }, { "epoch": 0.7694194113734724, "grad_norm": 0.44618505239486694, "learning_rate": 1.3291361628500266e-06, "loss": 0.7761, "step": 3814 }, { "epoch": 0.7696211469296794, "grad_norm": 1.669847011566162, "learning_rate": 1.326918282274211e-06, "loss": 0.7316, "step": 3815 }, { "epoch": 0.7698228824858864, "grad_norm": 0.8414665460586548, "learning_rate": 1.3247019705507596e-06, "loss": 0.7481, "step": 3816 }, { "epoch": 0.7700246180420934, "grad_norm": 0.5543730854988098, "learning_rate": 1.3224872286263058e-06, "loss": 0.778, "step": 3817 }, { "epoch": 0.7702263535983004, "grad_norm": 0.7356978058815002, "learning_rate": 1.3202740574468132e-06, "loss": 0.6415, "step": 3818 }, { "epoch": 0.7704280891545073, "grad_norm": 0.37701356410980225, "learning_rate": 1.3180624579575741e-06, "loss": 0.6663, "step": 3819 }, { "epoch": 0.7706298247107144, "grad_norm": 0.37720897793769836, "learning_rate": 1.3158524311032128e-06, "loss": 0.7926, "step": 3820 }, { "epoch": 0.7708315602669213, "grad_norm": 0.6713565587997437, "learning_rate": 1.3136439778276782e-06, "loss": 0.6525, "step": 3821 }, { "epoch": 0.7710332958231284, "grad_norm": 0.8381698131561279, "learning_rate": 1.3114370990742465e-06, "loss": 0.6224, "step": 3822 }, { "epoch": 0.7712350313793354, "grad_norm": 0.3800964653491974, "learning_rate": 1.309231795785526e-06, "loss": 0.6969, "step": 3823 }, { "epoch": 0.7714367669355423, "grad_norm": 0.544260561466217, "learning_rate": 1.3070280689034486e-06, "loss": 0.6555, "step": 3824 }, { "epoch": 0.7716385024917494, "grad_norm": 0.3614006042480469, "learning_rate": 1.304825919369273e-06, "loss": 0.6744, "step": 3825 }, { "epoch": 0.7718402380479563, "grad_norm": 0.4283572733402252, "learning_rate": 1.3026253481235845e-06, "loss": 0.6823, "step": 3826 }, { "epoch": 0.7720419736041633, "grad_norm": 0.4926629960536957, "learning_rate": 1.3004263561062935e-06, "loss": 0.7654, "step": 3827 }, { "epoch": 0.7722437091603703, "grad_norm": 0.5868778228759766, "learning_rate": 1.2982289442566392e-06, "loss": 0.6969, "step": 3828 }, { "epoch": 0.7724454447165773, "grad_norm": 0.3782004117965698, "learning_rate": 1.2960331135131826e-06, "loss": 0.6601, "step": 3829 }, { "epoch": 0.7726471802727843, "grad_norm": 0.5834161639213562, "learning_rate": 1.2938388648138089e-06, "loss": 0.6755, "step": 3830 }, { "epoch": 0.7728489158289913, "grad_norm": 0.3004898726940155, "learning_rate": 1.291646199095732e-06, "loss": 0.7482, "step": 3831 }, { "epoch": 0.7730506513851982, "grad_norm": 0.8956354856491089, "learning_rate": 1.289455117295485e-06, "loss": 0.661, "step": 3832 }, { "epoch": 0.7732523869414053, "grad_norm": 0.34841060638427734, "learning_rate": 1.2872656203489242e-06, "loss": 0.6674, "step": 3833 }, { "epoch": 0.7734541224976123, "grad_norm": 0.36636319756507874, "learning_rate": 1.2850777091912364e-06, "loss": 0.7774, "step": 3834 }, { "epoch": 0.7736558580538193, "grad_norm": 0.3407168388366699, "learning_rate": 1.2828913847569185e-06, "loss": 0.6519, "step": 3835 }, { "epoch": 0.7738575936100263, "grad_norm": 0.41359943151474, "learning_rate": 1.2807066479798013e-06, "loss": 0.7158, "step": 3836 }, { "epoch": 0.7740593291662332, "grad_norm": 0.3872455656528473, "learning_rate": 1.2785234997930345e-06, "loss": 0.6615, "step": 3837 }, { "epoch": 0.7742610647224403, "grad_norm": 0.4519384205341339, "learning_rate": 1.2763419411290823e-06, "loss": 0.6828, "step": 3838 }, { "epoch": 0.7744628002786472, "grad_norm": 1.394004464149475, "learning_rate": 1.2741619729197403e-06, "loss": 0.812, "step": 3839 }, { "epoch": 0.7746645358348543, "grad_norm": 0.38254714012145996, "learning_rate": 1.2719835960961173e-06, "loss": 0.6816, "step": 3840 }, { "epoch": 0.7748662713910612, "grad_norm": 0.390522837638855, "learning_rate": 1.2698068115886453e-06, "loss": 0.6515, "step": 3841 }, { "epoch": 0.7750680069472682, "grad_norm": 0.47609326243400574, "learning_rate": 1.2676316203270766e-06, "loss": 0.651, "step": 3842 }, { "epoch": 0.7752697425034752, "grad_norm": 0.7987368106842041, "learning_rate": 1.265458023240483e-06, "loss": 0.6626, "step": 3843 }, { "epoch": 0.7754714780596822, "grad_norm": 0.4103650450706482, "learning_rate": 1.2632860212572518e-06, "loss": 0.7753, "step": 3844 }, { "epoch": 0.7756732136158891, "grad_norm": 1.0546495914459229, "learning_rate": 1.2611156153050963e-06, "loss": 0.6805, "step": 3845 }, { "epoch": 0.7758749491720962, "grad_norm": 1.0925642251968384, "learning_rate": 1.2589468063110382e-06, "loss": 0.7166, "step": 3846 }, { "epoch": 0.7760766847283032, "grad_norm": 0.4217831492424011, "learning_rate": 1.2567795952014272e-06, "loss": 0.6609, "step": 3847 }, { "epoch": 0.7762784202845102, "grad_norm": 1.4595791101455688, "learning_rate": 1.2546139829019238e-06, "loss": 0.6633, "step": 3848 }, { "epoch": 0.7764801558407172, "grad_norm": 0.33885642886161804, "learning_rate": 1.2524499703375065e-06, "loss": 0.6822, "step": 3849 }, { "epoch": 0.7766818913969241, "grad_norm": 0.380485475063324, "learning_rate": 1.2502875584324748e-06, "loss": 0.6736, "step": 3850 }, { "epoch": 0.7768836269531312, "grad_norm": 0.486627459526062, "learning_rate": 1.2481267481104398e-06, "loss": 0.6623, "step": 3851 }, { "epoch": 0.7770853625093381, "grad_norm": 0.4864477515220642, "learning_rate": 1.245967540294329e-06, "loss": 0.6338, "step": 3852 }, { "epoch": 0.7772870980655452, "grad_norm": 0.5462962985038757, "learning_rate": 1.2438099359063898e-06, "loss": 0.7502, "step": 3853 }, { "epoch": 0.7774888336217521, "grad_norm": 0.4412570595741272, "learning_rate": 1.2416539358681772e-06, "loss": 0.6794, "step": 3854 }, { "epoch": 0.7776905691779591, "grad_norm": 0.32791343331336975, "learning_rate": 1.2394995411005672e-06, "loss": 0.8837, "step": 3855 }, { "epoch": 0.7778923047341662, "grad_norm": 0.6076446175575256, "learning_rate": 1.237346752523752e-06, "loss": 0.6854, "step": 3856 }, { "epoch": 0.7780940402903731, "grad_norm": 0.3447825610637665, "learning_rate": 1.2351955710572272e-06, "loss": 0.6444, "step": 3857 }, { "epoch": 0.7782957758465802, "grad_norm": 0.6088526844978333, "learning_rate": 1.233045997619814e-06, "loss": 0.6993, "step": 3858 }, { "epoch": 0.7784975114027871, "grad_norm": 0.5636637806892395, "learning_rate": 1.23089803312964e-06, "loss": 0.6693, "step": 3859 }, { "epoch": 0.7786992469589941, "grad_norm": 0.3898247480392456, "learning_rate": 1.2287516785041447e-06, "loss": 0.6597, "step": 3860 }, { "epoch": 0.7789009825152011, "grad_norm": 0.3873283565044403, "learning_rate": 1.2266069346600862e-06, "loss": 0.6447, "step": 3861 }, { "epoch": 0.7791027180714081, "grad_norm": 0.8242816925048828, "learning_rate": 1.224463802513529e-06, "loss": 0.8405, "step": 3862 }, { "epoch": 0.779304453627615, "grad_norm": 0.9445183873176575, "learning_rate": 1.2223222829798503e-06, "loss": 0.672, "step": 3863 }, { "epoch": 0.7795061891838221, "grad_norm": 0.4672654867172241, "learning_rate": 1.2201823769737408e-06, "loss": 0.6316, "step": 3864 }, { "epoch": 0.779707924740029, "grad_norm": 0.43315044045448303, "learning_rate": 1.2180440854092007e-06, "loss": 0.6807, "step": 3865 }, { "epoch": 0.7799096602962361, "grad_norm": 0.47044748067855835, "learning_rate": 1.2159074091995387e-06, "loss": 0.6416, "step": 3866 }, { "epoch": 0.780111395852443, "grad_norm": 0.361098051071167, "learning_rate": 1.2137723492573766e-06, "loss": 0.651, "step": 3867 }, { "epoch": 0.78031313140865, "grad_norm": 0.5044724345207214, "learning_rate": 1.2116389064946427e-06, "loss": 0.6424, "step": 3868 }, { "epoch": 0.7805148669648571, "grad_norm": 0.3531644642353058, "learning_rate": 1.209507081822579e-06, "loss": 0.666, "step": 3869 }, { "epoch": 0.780716602521064, "grad_norm": 0.7237960696220398, "learning_rate": 1.2073768761517325e-06, "loss": 0.6426, "step": 3870 }, { "epoch": 0.7809183380772711, "grad_norm": 0.5849398374557495, "learning_rate": 1.2052482903919577e-06, "loss": 0.758, "step": 3871 }, { "epoch": 0.781120073633478, "grad_norm": 0.47558656334877014, "learning_rate": 1.2031213254524237e-06, "loss": 0.6467, "step": 3872 }, { "epoch": 0.781321809189685, "grad_norm": 0.6503086686134338, "learning_rate": 1.2009959822416012e-06, "loss": 0.7555, "step": 3873 }, { "epoch": 0.781523544745892, "grad_norm": 0.8717564940452576, "learning_rate": 1.1988722616672698e-06, "loss": 0.6304, "step": 3874 }, { "epoch": 0.781725280302099, "grad_norm": 0.5620495676994324, "learning_rate": 1.1967501646365147e-06, "loss": 0.6471, "step": 3875 }, { "epoch": 0.781927015858306, "grad_norm": 0.46963924169540405, "learning_rate": 1.1946296920557327e-06, "loss": 0.6977, "step": 3876 }, { "epoch": 0.782128751414513, "grad_norm": 0.4893326461315155, "learning_rate": 1.1925108448306217e-06, "loss": 0.7063, "step": 3877 }, { "epoch": 0.7823304869707199, "grad_norm": 1.2923660278320312, "learning_rate": 1.1903936238661868e-06, "loss": 0.7065, "step": 3878 }, { "epoch": 0.782532222526927, "grad_norm": 1.1199201345443726, "learning_rate": 1.1882780300667374e-06, "loss": 0.6457, "step": 3879 }, { "epoch": 0.782733958083134, "grad_norm": 0.42425259947776794, "learning_rate": 1.1861640643358925e-06, "loss": 0.6513, "step": 3880 }, { "epoch": 0.7829356936393409, "grad_norm": 0.8278416991233826, "learning_rate": 1.18405172757657e-06, "loss": 0.6506, "step": 3881 }, { "epoch": 0.783137429195548, "grad_norm": 1.3610056638717651, "learning_rate": 1.1819410206909942e-06, "loss": 0.5924, "step": 3882 }, { "epoch": 0.7833391647517549, "grad_norm": 1.611168384552002, "learning_rate": 1.1798319445806955e-06, "loss": 0.6485, "step": 3883 }, { "epoch": 0.783540900307962, "grad_norm": 0.36074623465538025, "learning_rate": 1.1777245001465048e-06, "loss": 0.6858, "step": 3884 }, { "epoch": 0.7837426358641689, "grad_norm": 0.9947795867919922, "learning_rate": 1.1756186882885566e-06, "loss": 0.7189, "step": 3885 }, { "epoch": 0.7839443714203759, "grad_norm": 0.6028972268104553, "learning_rate": 1.1735145099062872e-06, "loss": 0.7583, "step": 3886 }, { "epoch": 0.7841461069765829, "grad_norm": 1.2518178224563599, "learning_rate": 1.1714119658984402e-06, "loss": 0.694, "step": 3887 }, { "epoch": 0.7843478425327899, "grad_norm": 0.311381995677948, "learning_rate": 1.169311057163055e-06, "loss": 1.0158, "step": 3888 }, { "epoch": 0.784549578088997, "grad_norm": 0.5026519298553467, "learning_rate": 1.167211784597474e-06, "loss": 0.6602, "step": 3889 }, { "epoch": 0.7847513136452039, "grad_norm": 0.7679435610771179, "learning_rate": 1.1651141490983442e-06, "loss": 0.6701, "step": 3890 }, { "epoch": 0.7849530492014108, "grad_norm": 0.34757447242736816, "learning_rate": 1.1630181515616102e-06, "loss": 0.6911, "step": 3891 }, { "epoch": 0.7851547847576179, "grad_norm": 0.4210948348045349, "learning_rate": 1.1609237928825174e-06, "loss": 0.7271, "step": 3892 }, { "epoch": 0.7853565203138249, "grad_norm": 0.3013748824596405, "learning_rate": 1.1588310739556113e-06, "loss": 0.6697, "step": 3893 }, { "epoch": 0.7855582558700319, "grad_norm": 0.8086436986923218, "learning_rate": 1.156739995674736e-06, "loss": 0.6954, "step": 3894 }, { "epoch": 0.7857599914262389, "grad_norm": 0.5072692632675171, "learning_rate": 1.1546505589330391e-06, "loss": 0.772, "step": 3895 }, { "epoch": 0.7859617269824458, "grad_norm": 0.7558883428573608, "learning_rate": 1.152562764622963e-06, "loss": 0.6673, "step": 3896 }, { "epoch": 0.7861634625386529, "grad_norm": 0.8792096972465515, "learning_rate": 1.1504766136362471e-06, "loss": 0.6173, "step": 3897 }, { "epoch": 0.7863651980948598, "grad_norm": 0.7532789707183838, "learning_rate": 1.1483921068639353e-06, "loss": 0.6114, "step": 3898 }, { "epoch": 0.7865669336510668, "grad_norm": 0.6622703075408936, "learning_rate": 1.1463092451963637e-06, "loss": 1.0339, "step": 3899 }, { "epoch": 0.7867686692072738, "grad_norm": 0.4823106825351715, "learning_rate": 1.1442280295231656e-06, "loss": 0.7618, "step": 3900 }, { "epoch": 0.7869704047634808, "grad_norm": 0.4021747410297394, "learning_rate": 1.1421484607332778e-06, "loss": 0.6722, "step": 3901 }, { "epoch": 0.7871721403196879, "grad_norm": 0.5340956449508667, "learning_rate": 1.1400705397149226e-06, "loss": 0.8126, "step": 3902 }, { "epoch": 0.7873738758758948, "grad_norm": 0.3946157693862915, "learning_rate": 1.1379942673556287e-06, "loss": 0.6543, "step": 3903 }, { "epoch": 0.7875756114321018, "grad_norm": 0.7702094316482544, "learning_rate": 1.1359196445422187e-06, "loss": 0.7597, "step": 3904 }, { "epoch": 0.7877773469883088, "grad_norm": 0.3735303580760956, "learning_rate": 1.1338466721608039e-06, "loss": 0.6527, "step": 3905 }, { "epoch": 0.7879790825445158, "grad_norm": 0.4215957522392273, "learning_rate": 1.1317753510967989e-06, "loss": 0.6755, "step": 3906 }, { "epoch": 0.7881808181007228, "grad_norm": 2.6547322273254395, "learning_rate": 1.1297056822349083e-06, "loss": 0.8418, "step": 3907 }, { "epoch": 0.7883825536569298, "grad_norm": 0.33156833052635193, "learning_rate": 1.1276376664591315e-06, "loss": 0.6247, "step": 3908 }, { "epoch": 0.7885842892131367, "grad_norm": 0.522953450679779, "learning_rate": 1.125571304652766e-06, "loss": 0.8095, "step": 3909 }, { "epoch": 0.7887860247693438, "grad_norm": 0.32513847947120667, "learning_rate": 1.1235065976983944e-06, "loss": 0.6806, "step": 3910 }, { "epoch": 0.7889877603255507, "grad_norm": 0.586698055267334, "learning_rate": 1.1214435464779006e-06, "loss": 0.6676, "step": 3911 }, { "epoch": 0.7891894958817578, "grad_norm": 0.805479884147644, "learning_rate": 1.1193821518724602e-06, "loss": 0.7554, "step": 3912 }, { "epoch": 0.7893912314379647, "grad_norm": 0.4024786949157715, "learning_rate": 1.1173224147625339e-06, "loss": 0.6999, "step": 3913 }, { "epoch": 0.7895929669941717, "grad_norm": 1.362681269645691, "learning_rate": 1.1152643360278847e-06, "loss": 0.6082, "step": 3914 }, { "epoch": 0.7897947025503788, "grad_norm": 0.38716137409210205, "learning_rate": 1.1132079165475601e-06, "loss": 0.8036, "step": 3915 }, { "epoch": 0.7899964381065857, "grad_norm": 0.44183582067489624, "learning_rate": 1.1111531571999e-06, "loss": 0.8228, "step": 3916 }, { "epoch": 0.7901981736627928, "grad_norm": 0.6368033289909363, "learning_rate": 1.1091000588625395e-06, "loss": 0.6956, "step": 3917 }, { "epoch": 0.7903999092189997, "grad_norm": 0.5896937847137451, "learning_rate": 1.1070486224124e-06, "loss": 0.8112, "step": 3918 }, { "epoch": 0.7906016447752067, "grad_norm": 0.4653731882572174, "learning_rate": 1.104998848725692e-06, "loss": 0.7249, "step": 3919 }, { "epoch": 0.7908033803314137, "grad_norm": 0.6194247007369995, "learning_rate": 1.1029507386779225e-06, "loss": 0.6755, "step": 3920 }, { "epoch": 0.7910051158876207, "grad_norm": 0.34444916248321533, "learning_rate": 1.1009042931438784e-06, "loss": 0.6422, "step": 3921 }, { "epoch": 0.7912068514438276, "grad_norm": 0.45076173543930054, "learning_rate": 1.0988595129976444e-06, "loss": 0.6559, "step": 3922 }, { "epoch": 0.7914085870000347, "grad_norm": 0.41652804613113403, "learning_rate": 1.096816399112589e-06, "loss": 0.6954, "step": 3923 }, { "epoch": 0.7916103225562416, "grad_norm": 0.3207223117351532, "learning_rate": 1.0947749523613683e-06, "loss": 0.7796, "step": 3924 }, { "epoch": 0.7918120581124487, "grad_norm": 0.3924807608127594, "learning_rate": 1.0927351736159314e-06, "loss": 0.6771, "step": 3925 }, { "epoch": 0.7920137936686557, "grad_norm": 1.2680659294128418, "learning_rate": 1.09069706374751e-06, "loss": 0.7168, "step": 3926 }, { "epoch": 0.7922155292248626, "grad_norm": 1.0150847434997559, "learning_rate": 1.088660623626624e-06, "loss": 0.7073, "step": 3927 }, { "epoch": 0.7924172647810697, "grad_norm": 0.3826291859149933, "learning_rate": 1.0866258541230835e-06, "loss": 0.7506, "step": 3928 }, { "epoch": 0.7926190003372766, "grad_norm": 0.8095358610153198, "learning_rate": 1.0845927561059805e-06, "loss": 0.6207, "step": 3929 }, { "epoch": 0.7928207358934837, "grad_norm": 0.41174301505088806, "learning_rate": 1.0825613304436938e-06, "loss": 0.6418, "step": 3930 }, { "epoch": 0.7930224714496906, "grad_norm": 0.4840712547302246, "learning_rate": 1.0805315780038922e-06, "loss": 0.684, "step": 3931 }, { "epoch": 0.7932242070058976, "grad_norm": 0.5175725817680359, "learning_rate": 1.078503499653525e-06, "loss": 0.6481, "step": 3932 }, { "epoch": 0.7934259425621046, "grad_norm": 0.424094557762146, "learning_rate": 1.0764770962588278e-06, "loss": 0.7043, "step": 3933 }, { "epoch": 0.7936276781183116, "grad_norm": 0.40806031227111816, "learning_rate": 1.074452368685322e-06, "loss": 0.6706, "step": 3934 }, { "epoch": 0.7938294136745186, "grad_norm": 0.8501918911933899, "learning_rate": 1.0724293177978106e-06, "loss": 0.8923, "step": 3935 }, { "epoch": 0.7940311492307256, "grad_norm": 1.5072156190872192, "learning_rate": 1.0704079444603855e-06, "loss": 0.7205, "step": 3936 }, { "epoch": 0.7942328847869325, "grad_norm": 0.581558346748352, "learning_rate": 1.0683882495364163e-06, "loss": 0.8027, "step": 3937 }, { "epoch": 0.7944346203431396, "grad_norm": 0.35465800762176514, "learning_rate": 1.0663702338885579e-06, "loss": 0.8202, "step": 3938 }, { "epoch": 0.7946363558993466, "grad_norm": 0.8600527048110962, "learning_rate": 1.0643538983787505e-06, "loss": 0.6867, "step": 3939 }, { "epoch": 0.7948380914555535, "grad_norm": 0.49030637741088867, "learning_rate": 1.062339243868213e-06, "loss": 0.6903, "step": 3940 }, { "epoch": 0.7950398270117606, "grad_norm": 0.5355114340782166, "learning_rate": 1.0603262712174477e-06, "loss": 0.9598, "step": 3941 }, { "epoch": 0.7952415625679675, "grad_norm": 0.5693348050117493, "learning_rate": 1.0583149812862382e-06, "loss": 0.6499, "step": 3942 }, { "epoch": 0.7954432981241746, "grad_norm": 0.3693627417087555, "learning_rate": 1.0563053749336516e-06, "loss": 0.8187, "step": 3943 }, { "epoch": 0.7956450336803815, "grad_norm": 0.31983208656311035, "learning_rate": 1.0542974530180327e-06, "loss": 0.6465, "step": 3944 }, { "epoch": 0.7958467692365885, "grad_norm": 0.9442469477653503, "learning_rate": 1.0522912163970073e-06, "loss": 0.6709, "step": 3945 }, { "epoch": 0.7960485047927955, "grad_norm": 0.4609791934490204, "learning_rate": 1.0502866659274847e-06, "loss": 0.6847, "step": 3946 }, { "epoch": 0.7962502403490025, "grad_norm": 1.130595326423645, "learning_rate": 1.0482838024656505e-06, "loss": 0.6629, "step": 3947 }, { "epoch": 0.7964519759052096, "grad_norm": 0.37832605838775635, "learning_rate": 1.0462826268669707e-06, "loss": 0.7392, "step": 3948 }, { "epoch": 0.7966537114614165, "grad_norm": 0.35009559988975525, "learning_rate": 1.0442831399861903e-06, "loss": 0.6671, "step": 3949 }, { "epoch": 0.7968554470176235, "grad_norm": 0.526996910572052, "learning_rate": 1.0422853426773322e-06, "loss": 0.629, "step": 3950 }, { "epoch": 0.7970571825738305, "grad_norm": 0.4254051744937897, "learning_rate": 1.040289235793701e-06, "loss": 0.7922, "step": 3951 }, { "epoch": 0.7972589181300375, "grad_norm": 0.6463653445243835, "learning_rate": 1.0382948201878767e-06, "loss": 0.8352, "step": 3952 }, { "epoch": 0.7974606536862445, "grad_norm": 0.4028523862361908, "learning_rate": 1.0363020967117143e-06, "loss": 0.6736, "step": 3953 }, { "epoch": 0.7976623892424515, "grad_norm": 0.6316519379615784, "learning_rate": 1.034311066216353e-06, "loss": 0.6796, "step": 3954 }, { "epoch": 0.7978641247986584, "grad_norm": 0.37582528591156006, "learning_rate": 1.0323217295522026e-06, "loss": 0.6582, "step": 3955 }, { "epoch": 0.7980658603548655, "grad_norm": 0.8689025640487671, "learning_rate": 1.0303340875689505e-06, "loss": 0.6513, "step": 3956 }, { "epoch": 0.7982675959110724, "grad_norm": 0.45216962695121765, "learning_rate": 1.028348141115565e-06, "loss": 0.6753, "step": 3957 }, { "epoch": 0.7984693314672794, "grad_norm": 0.5104213356971741, "learning_rate": 1.0263638910402834e-06, "loss": 0.7656, "step": 3958 }, { "epoch": 0.7986710670234864, "grad_norm": 0.35348913073539734, "learning_rate": 1.024381338190622e-06, "loss": 0.6837, "step": 3959 }, { "epoch": 0.7988728025796934, "grad_norm": 0.42005017399787903, "learning_rate": 1.0224004834133755e-06, "loss": 0.6548, "step": 3960 }, { "epoch": 0.7990745381359005, "grad_norm": 0.5248255133628845, "learning_rate": 1.0204213275546037e-06, "loss": 0.6795, "step": 3961 }, { "epoch": 0.7992762736921074, "grad_norm": 0.6113191843032837, "learning_rate": 1.0184438714596518e-06, "loss": 0.8211, "step": 3962 }, { "epoch": 0.7994780092483144, "grad_norm": 0.480577290058136, "learning_rate": 1.0164681159731316e-06, "loss": 0.6789, "step": 3963 }, { "epoch": 0.7996797448045214, "grad_norm": 0.5853450894355774, "learning_rate": 1.0144940619389298e-06, "loss": 0.668, "step": 3964 }, { "epoch": 0.7998814803607284, "grad_norm": 0.8544267416000366, "learning_rate": 1.01252171020021e-06, "loss": 0.718, "step": 3965 }, { "epoch": 0.8000832159169354, "grad_norm": 0.4894731938838959, "learning_rate": 1.0105510615994051e-06, "loss": 0.6801, "step": 3966 }, { "epoch": 0.8002849514731424, "grad_norm": 0.5039870738983154, "learning_rate": 1.00858211697822e-06, "loss": 0.6748, "step": 3967 }, { "epoch": 0.8004866870293493, "grad_norm": 0.6922500729560852, "learning_rate": 1.006614877177638e-06, "loss": 0.655, "step": 3968 }, { "epoch": 0.8006884225855564, "grad_norm": 2.2016608715057373, "learning_rate": 1.0046493430379029e-06, "loss": 0.852, "step": 3969 }, { "epoch": 0.8008901581417633, "grad_norm": 0.44182875752449036, "learning_rate": 1.0026855153985409e-06, "loss": 0.8245, "step": 3970 }, { "epoch": 0.8010918936979704, "grad_norm": 0.35396745800971985, "learning_rate": 1.000723395098347e-06, "loss": 0.6556, "step": 3971 }, { "epoch": 0.8012936292541774, "grad_norm": 0.33134302496910095, "learning_rate": 9.987629829753799e-07, "loss": 0.6738, "step": 3972 }, { "epoch": 0.8014953648103843, "grad_norm": 0.5623152852058411, "learning_rate": 9.968042798669775e-07, "loss": 0.6553, "step": 3973 }, { "epoch": 0.8016971003665914, "grad_norm": 0.7911794781684875, "learning_rate": 9.94847286609743e-07, "loss": 0.6228, "step": 3974 }, { "epoch": 0.8018988359227983, "grad_norm": 0.39568525552749634, "learning_rate": 9.928920040395495e-07, "loss": 0.7458, "step": 3975 }, { "epoch": 0.8021005714790053, "grad_norm": 0.3557928800582886, "learning_rate": 9.90938432991544e-07, "loss": 0.7113, "step": 3976 }, { "epoch": 0.8023023070352123, "grad_norm": 0.8656487464904785, "learning_rate": 9.889865743001332e-07, "loss": 0.6891, "step": 3977 }, { "epoch": 0.8025040425914193, "grad_norm": 0.665074348449707, "learning_rate": 9.87036428799001e-07, "loss": 0.6932, "step": 3978 }, { "epoch": 0.8027057781476263, "grad_norm": 0.6446415185928345, "learning_rate": 9.850879973210993e-07, "loss": 0.66, "step": 3979 }, { "epoch": 0.8029075137038333, "grad_norm": 0.7038710117340088, "learning_rate": 9.831412806986395e-07, "loss": 0.6797, "step": 3980 }, { "epoch": 0.8031092492600402, "grad_norm": 0.48777586221694946, "learning_rate": 9.811962797631102e-07, "loss": 0.6634, "step": 3981 }, { "epoch": 0.8033109848162473, "grad_norm": 0.33813929557800293, "learning_rate": 9.792529953452622e-07, "loss": 0.8279, "step": 3982 }, { "epoch": 0.8035127203724542, "grad_norm": 0.9014752507209778, "learning_rate": 9.773114282751134e-07, "loss": 0.7631, "step": 3983 }, { "epoch": 0.8037144559286613, "grad_norm": 0.8523035645484924, "learning_rate": 9.753715793819502e-07, "loss": 0.7108, "step": 3984 }, { "epoch": 0.8039161914848683, "grad_norm": 0.7649599313735962, "learning_rate": 9.734334494943237e-07, "loss": 0.638, "step": 3985 }, { "epoch": 0.8041179270410752, "grad_norm": 0.41619718074798584, "learning_rate": 9.714970394400492e-07, "loss": 0.7235, "step": 3986 }, { "epoch": 0.8043196625972823, "grad_norm": 0.35346657037734985, "learning_rate": 9.695623500462114e-07, "loss": 0.6938, "step": 3987 }, { "epoch": 0.8045213981534892, "grad_norm": 0.5827614068984985, "learning_rate": 9.676293821391568e-07, "loss": 0.6514, "step": 3988 }, { "epoch": 0.8047231337096963, "grad_norm": 0.9623099565505981, "learning_rate": 9.656981365444983e-07, "loss": 0.7336, "step": 3989 }, { "epoch": 0.8049248692659032, "grad_norm": 0.3670915365219116, "learning_rate": 9.637686140871121e-07, "loss": 0.6056, "step": 3990 }, { "epoch": 0.8051266048221102, "grad_norm": 0.7231318950653076, "learning_rate": 9.618408155911369e-07, "loss": 0.7102, "step": 3991 }, { "epoch": 0.8053283403783172, "grad_norm": 0.9028728008270264, "learning_rate": 9.599147418799803e-07, "loss": 0.8215, "step": 3992 }, { "epoch": 0.8055300759345242, "grad_norm": 0.6327657103538513, "learning_rate": 9.579903937763086e-07, "loss": 0.6552, "step": 3993 }, { "epoch": 0.8057318114907311, "grad_norm": 0.7391289472579956, "learning_rate": 9.560677721020506e-07, "loss": 0.6449, "step": 3994 }, { "epoch": 0.8059335470469382, "grad_norm": 0.49933743476867676, "learning_rate": 9.541468776784025e-07, "loss": 0.6742, "step": 3995 }, { "epoch": 0.8061352826031452, "grad_norm": 0.9601204991340637, "learning_rate": 9.522277113258177e-07, "loss": 0.6404, "step": 3996 }, { "epoch": 0.8063370181593522, "grad_norm": 0.4408121109008789, "learning_rate": 9.503102738640146e-07, "loss": 0.6537, "step": 3997 }, { "epoch": 0.8065387537155592, "grad_norm": 0.35162702202796936, "learning_rate": 9.483945661119698e-07, "loss": 0.7622, "step": 3998 }, { "epoch": 0.8067404892717661, "grad_norm": 0.3745332360267639, "learning_rate": 9.464805888879264e-07, "loss": 0.6869, "step": 3999 }, { "epoch": 0.8069422248279732, "grad_norm": 0.36443084478378296, "learning_rate": 9.445683430093843e-07, "loss": 0.8024, "step": 4000 }, { "epoch": 0.8071439603841801, "grad_norm": 0.8786391019821167, "learning_rate": 9.426578292931033e-07, "loss": 0.6622, "step": 4001 }, { "epoch": 0.8073456959403872, "grad_norm": 0.3806266188621521, "learning_rate": 9.407490485551068e-07, "loss": 0.6544, "step": 4002 }, { "epoch": 0.8075474314965941, "grad_norm": 0.3530179262161255, "learning_rate": 9.388420016106764e-07, "loss": 0.7667, "step": 4003 }, { "epoch": 0.8077491670528011, "grad_norm": 0.5868592858314514, "learning_rate": 9.36936689274352e-07, "loss": 0.6702, "step": 4004 }, { "epoch": 0.8079509026090081, "grad_norm": 0.31058308482170105, "learning_rate": 9.350331123599327e-07, "loss": 0.9213, "step": 4005 }, { "epoch": 0.8081526381652151, "grad_norm": 0.4726583659648895, "learning_rate": 9.331312716804791e-07, "loss": 0.6907, "step": 4006 }, { "epoch": 0.8083543737214222, "grad_norm": 0.3896176815032959, "learning_rate": 9.312311680483083e-07, "loss": 0.8602, "step": 4007 }, { "epoch": 0.8085561092776291, "grad_norm": 0.5245814919471741, "learning_rate": 9.293328022749942e-07, "loss": 0.6245, "step": 4008 }, { "epoch": 0.8087578448338361, "grad_norm": 0.42944633960723877, "learning_rate": 9.27436175171369e-07, "loss": 0.8182, "step": 4009 }, { "epoch": 0.8089595803900431, "grad_norm": 1.1788564920425415, "learning_rate": 9.255412875475256e-07, "loss": 0.6822, "step": 4010 }, { "epoch": 0.8091613159462501, "grad_norm": 0.9582619071006775, "learning_rate": 9.2364814021281e-07, "loss": 0.8864, "step": 4011 }, { "epoch": 0.8093630515024571, "grad_norm": 0.6898618936538696, "learning_rate": 9.217567339758254e-07, "loss": 0.6338, "step": 4012 }, { "epoch": 0.8095647870586641, "grad_norm": 0.6961838006973267, "learning_rate": 9.198670696444339e-07, "loss": 0.6896, "step": 4013 }, { "epoch": 0.809766522614871, "grad_norm": 0.3257087767124176, "learning_rate": 9.179791480257511e-07, "loss": 0.7154, "step": 4014 }, { "epoch": 0.8099682581710781, "grad_norm": 0.4184440076351166, "learning_rate": 9.160929699261479e-07, "loss": 0.6807, "step": 4015 }, { "epoch": 0.810169993727285, "grad_norm": 0.8447985649108887, "learning_rate": 9.142085361512548e-07, "loss": 0.6644, "step": 4016 }, { "epoch": 0.810371729283492, "grad_norm": 0.49792590737342834, "learning_rate": 9.123258475059493e-07, "loss": 0.6581, "step": 4017 }, { "epoch": 0.810573464839699, "grad_norm": 0.32772713899612427, "learning_rate": 9.104449047943725e-07, "loss": 0.6658, "step": 4018 }, { "epoch": 0.810775200395906, "grad_norm": 0.5296732187271118, "learning_rate": 9.08565708819914e-07, "loss": 0.6357, "step": 4019 }, { "epoch": 0.8109769359521131, "grad_norm": 0.5747442245483398, "learning_rate": 9.066882603852173e-07, "loss": 0.6868, "step": 4020 }, { "epoch": 0.81117867150832, "grad_norm": 0.36947518587112427, "learning_rate": 9.048125602921843e-07, "loss": 0.641, "step": 4021 }, { "epoch": 0.811380407064527, "grad_norm": 0.3003344237804413, "learning_rate": 9.029386093419651e-07, "loss": 0.6279, "step": 4022 }, { "epoch": 0.811582142620734, "grad_norm": 0.6348669528961182, "learning_rate": 9.010664083349635e-07, "loss": 0.6277, "step": 4023 }, { "epoch": 0.811783878176941, "grad_norm": 0.4441215991973877, "learning_rate": 8.991959580708409e-07, "loss": 0.7389, "step": 4024 }, { "epoch": 0.811985613733148, "grad_norm": 0.5426074266433716, "learning_rate": 8.973272593485011e-07, "loss": 0.6705, "step": 4025 }, { "epoch": 0.812187349289355, "grad_norm": 0.9262121319770813, "learning_rate": 8.954603129661088e-07, "loss": 0.656, "step": 4026 }, { "epoch": 0.8123890848455619, "grad_norm": 0.46151402592658997, "learning_rate": 8.935951197210796e-07, "loss": 0.7806, "step": 4027 }, { "epoch": 0.812590820401769, "grad_norm": 0.9116199612617493, "learning_rate": 8.917316804100723e-07, "loss": 0.7286, "step": 4028 }, { "epoch": 0.812792555957976, "grad_norm": 0.5533108711242676, "learning_rate": 8.898699958290063e-07, "loss": 0.6867, "step": 4029 }, { "epoch": 0.812994291514183, "grad_norm": 0.6026671528816223, "learning_rate": 8.880100667730457e-07, "loss": 0.6471, "step": 4030 }, { "epoch": 0.81319602707039, "grad_norm": 0.35685989260673523, "learning_rate": 8.861518940366043e-07, "loss": 0.7645, "step": 4031 }, { "epoch": 0.8133977626265969, "grad_norm": 0.42927679419517517, "learning_rate": 8.842954784133517e-07, "loss": 0.8251, "step": 4032 }, { "epoch": 0.813599498182804, "grad_norm": 0.5092020034790039, "learning_rate": 8.824408206962004e-07, "loss": 0.668, "step": 4033 }, { "epoch": 0.8138012337390109, "grad_norm": 0.6452570557594299, "learning_rate": 8.805879216773139e-07, "loss": 0.6546, "step": 4034 }, { "epoch": 0.8140029692952179, "grad_norm": 0.4207552373409271, "learning_rate": 8.787367821481096e-07, "loss": 0.7028, "step": 4035 }, { "epoch": 0.8142047048514249, "grad_norm": 0.3666224479675293, "learning_rate": 8.768874028992431e-07, "loss": 0.6957, "step": 4036 }, { "epoch": 0.8144064404076319, "grad_norm": 0.544642448425293, "learning_rate": 8.750397847206288e-07, "loss": 0.666, "step": 4037 }, { "epoch": 0.8146081759638389, "grad_norm": 0.3639770448207855, "learning_rate": 8.731939284014223e-07, "loss": 0.6683, "step": 4038 }, { "epoch": 0.8148099115200459, "grad_norm": 0.5589380264282227, "learning_rate": 8.713498347300281e-07, "loss": 0.8243, "step": 4039 }, { "epoch": 0.8150116470762528, "grad_norm": 0.3419032692909241, "learning_rate": 8.695075044940998e-07, "loss": 0.6522, "step": 4040 }, { "epoch": 0.8152133826324599, "grad_norm": 0.9368754625320435, "learning_rate": 8.676669384805359e-07, "loss": 0.8444, "step": 4041 }, { "epoch": 0.8154151181886669, "grad_norm": 0.3633244037628174, "learning_rate": 8.658281374754807e-07, "loss": 0.6971, "step": 4042 }, { "epoch": 0.8156168537448739, "grad_norm": 0.40811094641685486, "learning_rate": 8.639911022643288e-07, "loss": 0.679, "step": 4043 }, { "epoch": 0.8158185893010809, "grad_norm": 0.5470287799835205, "learning_rate": 8.621558336317132e-07, "loss": 0.7524, "step": 4044 }, { "epoch": 0.8160203248572878, "grad_norm": 0.4651123583316803, "learning_rate": 8.60322332361519e-07, "loss": 0.7018, "step": 4045 }, { "epoch": 0.8162220604134949, "grad_norm": 0.3326999843120575, "learning_rate": 8.584905992368764e-07, "loss": 0.6496, "step": 4046 }, { "epoch": 0.8164237959697018, "grad_norm": 1.3912816047668457, "learning_rate": 8.56660635040153e-07, "loss": 0.8146, "step": 4047 }, { "epoch": 0.8166255315259089, "grad_norm": 0.3894466459751129, "learning_rate": 8.548324405529696e-07, "loss": 0.7605, "step": 4048 }, { "epoch": 0.8168272670821158, "grad_norm": 0.5294163227081299, "learning_rate": 8.530060165561871e-07, "loss": 0.7352, "step": 4049 }, { "epoch": 0.8170290026383228, "grad_norm": 0.4345068335533142, "learning_rate": 8.511813638299082e-07, "loss": 0.7604, "step": 4050 }, { "epoch": 0.8172307381945298, "grad_norm": 0.7005764842033386, "learning_rate": 8.493584831534845e-07, "loss": 0.7812, "step": 4051 }, { "epoch": 0.8174324737507368, "grad_norm": 0.44860848784446716, "learning_rate": 8.475373753055067e-07, "loss": 0.6624, "step": 4052 }, { "epoch": 0.8176342093069437, "grad_norm": 0.9805493354797363, "learning_rate": 8.457180410638072e-07, "loss": 0.709, "step": 4053 }, { "epoch": 0.8178359448631508, "grad_norm": 0.35368284583091736, "learning_rate": 8.439004812054658e-07, "loss": 0.7585, "step": 4054 }, { "epoch": 0.8180376804193578, "grad_norm": 1.1566565036773682, "learning_rate": 8.420846965068003e-07, "loss": 0.762, "step": 4055 }, { "epoch": 0.8182394159755648, "grad_norm": 0.9465445280075073, "learning_rate": 8.402706877433708e-07, "loss": 0.7044, "step": 4056 }, { "epoch": 0.8184411515317718, "grad_norm": 0.5295639038085938, "learning_rate": 8.384584556899805e-07, "loss": 0.6468, "step": 4057 }, { "epoch": 0.8186428870879787, "grad_norm": 0.4885746240615845, "learning_rate": 8.366480011206707e-07, "loss": 0.8564, "step": 4058 }, { "epoch": 0.8188446226441858, "grad_norm": 0.4677394926548004, "learning_rate": 8.348393248087289e-07, "loss": 0.7055, "step": 4059 }, { "epoch": 0.8190463582003927, "grad_norm": 0.5112569332122803, "learning_rate": 8.330324275266777e-07, "loss": 0.8003, "step": 4060 }, { "epoch": 0.8192480937565998, "grad_norm": 0.7537760138511658, "learning_rate": 8.312273100462809e-07, "loss": 0.6848, "step": 4061 }, { "epoch": 0.8194498293128067, "grad_norm": 0.3288614749908447, "learning_rate": 8.294239731385456e-07, "loss": 1.0145, "step": 4062 }, { "epoch": 0.8196515648690137, "grad_norm": 0.6865113377571106, "learning_rate": 8.276224175737152e-07, "loss": 0.7945, "step": 4063 }, { "epoch": 0.8198533004252208, "grad_norm": 0.46431106328964233, "learning_rate": 8.258226441212719e-07, "loss": 0.6765, "step": 4064 }, { "epoch": 0.8200550359814277, "grad_norm": 0.6039952635765076, "learning_rate": 8.240246535499369e-07, "loss": 0.6528, "step": 4065 }, { "epoch": 0.8202567715376348, "grad_norm": 0.473136842250824, "learning_rate": 8.222284466276731e-07, "loss": 0.6434, "step": 4066 }, { "epoch": 0.8204585070938417, "grad_norm": 0.46378180384635925, "learning_rate": 8.20434024121678e-07, "loss": 0.7357, "step": 4067 }, { "epoch": 0.8206602426500487, "grad_norm": 0.5360262989997864, "learning_rate": 8.186413867983872e-07, "loss": 0.6118, "step": 4068 }, { "epoch": 0.8208619782062557, "grad_norm": 0.32590314745903015, "learning_rate": 8.168505354234774e-07, "loss": 0.6723, "step": 4069 }, { "epoch": 0.8210637137624627, "grad_norm": 0.364083468914032, "learning_rate": 8.150614707618576e-07, "loss": 0.8355, "step": 4070 }, { "epoch": 0.8212654493186696, "grad_norm": 0.5447162985801697, "learning_rate": 8.132741935776767e-07, "loss": 0.8829, "step": 4071 }, { "epoch": 0.8214671848748767, "grad_norm": 0.9954445362091064, "learning_rate": 8.114887046343184e-07, "loss": 0.6878, "step": 4072 }, { "epoch": 0.8216689204310836, "grad_norm": 0.7198444604873657, "learning_rate": 8.097050046944039e-07, "loss": 0.6585, "step": 4073 }, { "epoch": 0.8218706559872907, "grad_norm": 1.0638344287872314, "learning_rate": 8.079230945197908e-07, "loss": 0.7947, "step": 4074 }, { "epoch": 0.8220723915434976, "grad_norm": 1.1641714572906494, "learning_rate": 8.061429748715705e-07, "loss": 0.6389, "step": 4075 }, { "epoch": 0.8222741270997046, "grad_norm": 0.6671349406242371, "learning_rate": 8.043646465100696e-07, "loss": 0.6476, "step": 4076 }, { "epoch": 0.8224758626559117, "grad_norm": 0.4042210280895233, "learning_rate": 8.02588110194853e-07, "loss": 0.6908, "step": 4077 }, { "epoch": 0.8226775982121186, "grad_norm": 0.49477052688598633, "learning_rate": 8.008133666847156e-07, "loss": 0.7094, "step": 4078 }, { "epoch": 0.8228793337683257, "grad_norm": 0.5312076807022095, "learning_rate": 7.990404167376886e-07, "loss": 0.7471, "step": 4079 }, { "epoch": 0.8230810693245326, "grad_norm": 0.4298705756664276, "learning_rate": 7.972692611110384e-07, "loss": 0.716, "step": 4080 }, { "epoch": 0.8232828048807396, "grad_norm": 0.5681132674217224, "learning_rate": 7.954999005612629e-07, "loss": 0.6814, "step": 4081 }, { "epoch": 0.8234845404369466, "grad_norm": 0.49187588691711426, "learning_rate": 7.937323358440935e-07, "loss": 0.674, "step": 4082 }, { "epoch": 0.8236862759931536, "grad_norm": 0.4805457293987274, "learning_rate": 7.919665677144983e-07, "loss": 0.6772, "step": 4083 }, { "epoch": 0.8238880115493606, "grad_norm": 0.41921690106391907, "learning_rate": 7.902025969266702e-07, "loss": 0.6761, "step": 4084 }, { "epoch": 0.8240897471055676, "grad_norm": 0.5359138250350952, "learning_rate": 7.884404242340421e-07, "loss": 0.6255, "step": 4085 }, { "epoch": 0.8242914826617745, "grad_norm": 0.39744481444358826, "learning_rate": 7.866800503892758e-07, "loss": 0.7994, "step": 4086 }, { "epoch": 0.8244932182179816, "grad_norm": 0.47074541449546814, "learning_rate": 7.849214761442637e-07, "loss": 0.6598, "step": 4087 }, { "epoch": 0.8246949537741886, "grad_norm": 0.4479901194572449, "learning_rate": 7.83164702250132e-07, "loss": 0.6677, "step": 4088 }, { "epoch": 0.8248966893303955, "grad_norm": 3.9795687198638916, "learning_rate": 7.814097294572365e-07, "loss": 0.6239, "step": 4089 }, { "epoch": 0.8250984248866026, "grad_norm": 0.34133467078208923, "learning_rate": 7.796565585151621e-07, "loss": 0.7294, "step": 4090 }, { "epoch": 0.8253001604428095, "grad_norm": 0.5068195462226868, "learning_rate": 7.779051901727297e-07, "loss": 0.6838, "step": 4091 }, { "epoch": 0.8255018959990166, "grad_norm": 6.181421279907227, "learning_rate": 7.761556251779823e-07, "loss": 0.6886, "step": 4092 }, { "epoch": 0.8257036315552235, "grad_norm": 0.48550984263420105, "learning_rate": 7.744078642781982e-07, "loss": 0.6524, "step": 4093 }, { "epoch": 0.8259053671114305, "grad_norm": 0.8800036311149597, "learning_rate": 7.726619082198871e-07, "loss": 0.6432, "step": 4094 }, { "epoch": 0.8261071026676375, "grad_norm": 0.6388117074966431, "learning_rate": 7.709177577487786e-07, "loss": 0.6767, "step": 4095 }, { "epoch": 0.8263088382238445, "grad_norm": 0.3175949454307556, "learning_rate": 7.691754136098417e-07, "loss": 0.692, "step": 4096 }, { "epoch": 0.8265105737800515, "grad_norm": 0.7242339253425598, "learning_rate": 7.674348765472672e-07, "loss": 0.6378, "step": 4097 }, { "epoch": 0.8267123093362585, "grad_norm": 0.3929362893104553, "learning_rate": 7.656961473044744e-07, "loss": 0.675, "step": 4098 }, { "epoch": 0.8269140448924654, "grad_norm": 1.0461013317108154, "learning_rate": 7.63959226624117e-07, "loss": 0.675, "step": 4099 }, { "epoch": 0.8271157804486725, "grad_norm": 0.5271601676940918, "learning_rate": 7.622241152480652e-07, "loss": 0.6348, "step": 4100 }, { "epoch": 0.8273175160048795, "grad_norm": 1.4197477102279663, "learning_rate": 7.604908139174255e-07, "loss": 0.7237, "step": 4101 }, { "epoch": 0.8275192515610865, "grad_norm": 0.4259219765663147, "learning_rate": 7.587593233725305e-07, "loss": 0.6484, "step": 4102 }, { "epoch": 0.8277209871172935, "grad_norm": 0.34800437092781067, "learning_rate": 7.570296443529318e-07, "loss": 0.6178, "step": 4103 }, { "epoch": 0.8279227226735004, "grad_norm": 0.5846043825149536, "learning_rate": 7.55301777597417e-07, "loss": 0.6755, "step": 4104 }, { "epoch": 0.8281244582297075, "grad_norm": 0.34522733092308044, "learning_rate": 7.535757238439939e-07, "loss": 0.6354, "step": 4105 }, { "epoch": 0.8283261937859144, "grad_norm": 0.5267305374145508, "learning_rate": 7.518514838298957e-07, "loss": 0.741, "step": 4106 }, { "epoch": 0.8285279293421214, "grad_norm": 0.5945020318031311, "learning_rate": 7.501290582915849e-07, "loss": 0.7373, "step": 4107 }, { "epoch": 0.8287296648983284, "grad_norm": 0.4800272583961487, "learning_rate": 7.484084479647458e-07, "loss": 0.6785, "step": 4108 }, { "epoch": 0.8289314004545354, "grad_norm": 0.37526294589042664, "learning_rate": 7.466896535842865e-07, "loss": 0.6325, "step": 4109 }, { "epoch": 0.8291331360107425, "grad_norm": 0.7243527770042419, "learning_rate": 7.449726758843434e-07, "loss": 0.6231, "step": 4110 }, { "epoch": 0.8293348715669494, "grad_norm": 0.4649500250816345, "learning_rate": 7.432575155982741e-07, "loss": 0.7683, "step": 4111 }, { "epoch": 0.8295366071231564, "grad_norm": 0.532231330871582, "learning_rate": 7.415441734586604e-07, "loss": 0.6805, "step": 4112 }, { "epoch": 0.8297383426793634, "grad_norm": 0.7913029193878174, "learning_rate": 7.398326501973069e-07, "loss": 0.6648, "step": 4113 }, { "epoch": 0.8299400782355704, "grad_norm": 0.5430890917778015, "learning_rate": 7.381229465452417e-07, "loss": 0.6817, "step": 4114 }, { "epoch": 0.8301418137917774, "grad_norm": 0.3268798291683197, "learning_rate": 7.364150632327182e-07, "loss": 0.6853, "step": 4115 }, { "epoch": 0.8303435493479844, "grad_norm": 0.39360764622688293, "learning_rate": 7.347090009892089e-07, "loss": 0.6283, "step": 4116 }, { "epoch": 0.8305452849041913, "grad_norm": 0.44346925616264343, "learning_rate": 7.330047605434087e-07, "loss": 0.6706, "step": 4117 }, { "epoch": 0.8307470204603984, "grad_norm": 0.6690890789031982, "learning_rate": 7.313023426232374e-07, "loss": 0.6884, "step": 4118 }, { "epoch": 0.8309487560166053, "grad_norm": 0.6090205311775208, "learning_rate": 7.296017479558338e-07, "loss": 0.8355, "step": 4119 }, { "epoch": 0.8311504915728124, "grad_norm": 0.8864596486091614, "learning_rate": 7.279029772675572e-07, "loss": 0.6982, "step": 4120 }, { "epoch": 0.8313522271290193, "grad_norm": 0.3951992690563202, "learning_rate": 7.262060312839908e-07, "loss": 0.7975, "step": 4121 }, { "epoch": 0.8315539626852263, "grad_norm": 0.4038160443305969, "learning_rate": 7.24510910729937e-07, "loss": 0.6582, "step": 4122 }, { "epoch": 0.8317556982414334, "grad_norm": 0.36711713671684265, "learning_rate": 7.228176163294171e-07, "loss": 0.6611, "step": 4123 }, { "epoch": 0.8319574337976403, "grad_norm": 0.3582766056060791, "learning_rate": 7.211261488056731e-07, "loss": 0.7128, "step": 4124 }, { "epoch": 0.8321591693538474, "grad_norm": 0.40361008048057556, "learning_rate": 7.194365088811689e-07, "loss": 0.6902, "step": 4125 }, { "epoch": 0.8323609049100543, "grad_norm": 0.38358166813850403, "learning_rate": 7.17748697277586e-07, "loss": 0.7614, "step": 4126 }, { "epoch": 0.8325626404662613, "grad_norm": 0.33210864663124084, "learning_rate": 7.160627147158244e-07, "loss": 0.8363, "step": 4127 }, { "epoch": 0.8327643760224683, "grad_norm": 1.1483681201934814, "learning_rate": 7.143785619160026e-07, "loss": 0.7553, "step": 4128 }, { "epoch": 0.8329661115786753, "grad_norm": 0.6336138844490051, "learning_rate": 7.126962395974607e-07, "loss": 0.7545, "step": 4129 }, { "epoch": 0.8331678471348822, "grad_norm": 0.38122066855430603, "learning_rate": 7.110157484787538e-07, "loss": 0.7034, "step": 4130 }, { "epoch": 0.8333695826910893, "grad_norm": 1.3744087219238281, "learning_rate": 7.093370892776558e-07, "loss": 0.7234, "step": 4131 }, { "epoch": 0.8335713182472962, "grad_norm": 0.5250982642173767, "learning_rate": 7.076602627111573e-07, "loss": 0.6685, "step": 4132 }, { "epoch": 0.8337730538035033, "grad_norm": 0.4979982078075409, "learning_rate": 7.059852694954694e-07, "loss": 0.6442, "step": 4133 }, { "epoch": 0.8339747893597103, "grad_norm": 0.41418156027793884, "learning_rate": 7.04312110346016e-07, "loss": 0.7095, "step": 4134 }, { "epoch": 0.8341765249159172, "grad_norm": 0.4083440601825714, "learning_rate": 7.026407859774393e-07, "loss": 0.7809, "step": 4135 }, { "epoch": 0.8343782604721243, "grad_norm": 0.5151472091674805, "learning_rate": 7.009712971035998e-07, "loss": 0.6721, "step": 4136 }, { "epoch": 0.8345799960283312, "grad_norm": 0.9469529390335083, "learning_rate": 6.993036444375706e-07, "loss": 0.6296, "step": 4137 }, { "epoch": 0.8347817315845383, "grad_norm": 0.8012638688087463, "learning_rate": 6.976378286916414e-07, "loss": 0.6933, "step": 4138 }, { "epoch": 0.8349834671407452, "grad_norm": 0.581827700138092, "learning_rate": 6.959738505773211e-07, "loss": 0.8131, "step": 4139 }, { "epoch": 0.8351852026969522, "grad_norm": 0.3581453263759613, "learning_rate": 6.943117108053265e-07, "loss": 0.8009, "step": 4140 }, { "epoch": 0.8353869382531592, "grad_norm": 0.396505743265152, "learning_rate": 6.926514100855964e-07, "loss": 0.6704, "step": 4141 }, { "epoch": 0.8355886738093662, "grad_norm": 0.38942739367485046, "learning_rate": 6.909929491272799e-07, "loss": 0.6357, "step": 4142 }, { "epoch": 0.8357904093655733, "grad_norm": 1.1855741739273071, "learning_rate": 6.893363286387405e-07, "loss": 0.6746, "step": 4143 }, { "epoch": 0.8359921449217802, "grad_norm": 0.3587276339530945, "learning_rate": 6.876815493275585e-07, "loss": 0.7073, "step": 4144 }, { "epoch": 0.8361938804779872, "grad_norm": 0.45886123180389404, "learning_rate": 6.860286119005255e-07, "loss": 0.7007, "step": 4145 }, { "epoch": 0.8363956160341942, "grad_norm": 0.48944273591041565, "learning_rate": 6.843775170636441e-07, "loss": 0.614, "step": 4146 }, { "epoch": 0.8365973515904012, "grad_norm": 0.4077273905277252, "learning_rate": 6.827282655221373e-07, "loss": 0.7222, "step": 4147 }, { "epoch": 0.8367990871466081, "grad_norm": 0.537196695804596, "learning_rate": 6.810808579804306e-07, "loss": 0.6385, "step": 4148 }, { "epoch": 0.8370008227028152, "grad_norm": 0.34784817695617676, "learning_rate": 6.794352951421695e-07, "loss": 0.6262, "step": 4149 }, { "epoch": 0.8372025582590221, "grad_norm": 0.5768676400184631, "learning_rate": 6.777915777102123e-07, "loss": 0.6929, "step": 4150 }, { "epoch": 0.8374042938152292, "grad_norm": 0.5870742201805115, "learning_rate": 6.761497063866207e-07, "loss": 0.7894, "step": 4151 }, { "epoch": 0.8376060293714361, "grad_norm": 1.8497966527938843, "learning_rate": 6.745096818726776e-07, "loss": 0.6915, "step": 4152 }, { "epoch": 0.8378077649276431, "grad_norm": 0.4410600960254669, "learning_rate": 6.728715048688711e-07, "loss": 0.6588, "step": 4153 }, { "epoch": 0.8380095004838501, "grad_norm": 0.4844304025173187, "learning_rate": 6.712351760749014e-07, "loss": 0.9008, "step": 4154 }, { "epoch": 0.8382112360400571, "grad_norm": 0.47696301341056824, "learning_rate": 6.696006961896812e-07, "loss": 0.7537, "step": 4155 }, { "epoch": 0.8384129715962642, "grad_norm": 0.46135079860687256, "learning_rate": 6.679680659113313e-07, "loss": 0.6742, "step": 4156 }, { "epoch": 0.8386147071524711, "grad_norm": 0.6165065169334412, "learning_rate": 6.66337285937183e-07, "loss": 0.6473, "step": 4157 }, { "epoch": 0.8388164427086781, "grad_norm": 0.6609275937080383, "learning_rate": 6.647083569637797e-07, "loss": 0.6924, "step": 4158 }, { "epoch": 0.8390181782648851, "grad_norm": 0.4120253026485443, "learning_rate": 6.630812796868679e-07, "loss": 0.7029, "step": 4159 }, { "epoch": 0.8392199138210921, "grad_norm": 0.7023719549179077, "learning_rate": 6.61456054801411e-07, "loss": 0.704, "step": 4160 }, { "epoch": 0.8394216493772991, "grad_norm": 0.3818644881248474, "learning_rate": 6.598326830015761e-07, "loss": 0.6771, "step": 4161 }, { "epoch": 0.8396233849335061, "grad_norm": 0.40819141268730164, "learning_rate": 6.582111649807399e-07, "loss": 0.6666, "step": 4162 }, { "epoch": 0.839825120489713, "grad_norm": 0.45097702741622925, "learning_rate": 6.565915014314895e-07, "loss": 0.7478, "step": 4163 }, { "epoch": 0.8400268560459201, "grad_norm": 0.4159963130950928, "learning_rate": 6.549736930456163e-07, "loss": 0.6349, "step": 4164 }, { "epoch": 0.840228591602127, "grad_norm": 0.7589116096496582, "learning_rate": 6.533577405141211e-07, "loss": 0.9, "step": 4165 }, { "epoch": 0.840430327158334, "grad_norm": 0.6266133189201355, "learning_rate": 6.517436445272135e-07, "loss": 0.6469, "step": 4166 }, { "epoch": 0.840632062714541, "grad_norm": 0.8853380084037781, "learning_rate": 6.501314057743085e-07, "loss": 0.6754, "step": 4167 }, { "epoch": 0.840833798270748, "grad_norm": 0.6288461089134216, "learning_rate": 6.48521024944026e-07, "loss": 0.6727, "step": 4168 }, { "epoch": 0.8410355338269551, "grad_norm": 2.7457149028778076, "learning_rate": 6.46912502724198e-07, "loss": 0.6679, "step": 4169 }, { "epoch": 0.841237269383162, "grad_norm": 1.6510045528411865, "learning_rate": 6.453058398018541e-07, "loss": 0.7959, "step": 4170 }, { "epoch": 0.841439004939369, "grad_norm": 0.7876770496368408, "learning_rate": 6.43701036863239e-07, "loss": 0.8301, "step": 4171 }, { "epoch": 0.841640740495576, "grad_norm": 0.2771237790584564, "learning_rate": 6.420980945937971e-07, "loss": 0.7384, "step": 4172 }, { "epoch": 0.841842476051783, "grad_norm": 0.6143053770065308, "learning_rate": 6.40497013678178e-07, "loss": 0.6745, "step": 4173 }, { "epoch": 0.84204421160799, "grad_norm": 0.3154948055744171, "learning_rate": 6.388977948002406e-07, "loss": 0.6943, "step": 4174 }, { "epoch": 0.842245947164197, "grad_norm": 0.3753282129764557, "learning_rate": 6.373004386430442e-07, "loss": 0.6293, "step": 4175 }, { "epoch": 0.8424476827204039, "grad_norm": 0.677807629108429, "learning_rate": 6.357049458888537e-07, "loss": 0.6477, "step": 4176 }, { "epoch": 0.842649418276611, "grad_norm": 0.4451962113380432, "learning_rate": 6.341113172191399e-07, "loss": 0.7745, "step": 4177 }, { "epoch": 0.842851153832818, "grad_norm": 0.425942599773407, "learning_rate": 6.325195533145751e-07, "loss": 0.6755, "step": 4178 }, { "epoch": 0.843052889389025, "grad_norm": 0.8061302900314331, "learning_rate": 6.309296548550359e-07, "loss": 0.6743, "step": 4179 }, { "epoch": 0.843254624945232, "grad_norm": 0.5910888314247131, "learning_rate": 6.293416225196009e-07, "loss": 0.6547, "step": 4180 }, { "epoch": 0.8434563605014389, "grad_norm": 0.42811521887779236, "learning_rate": 6.277554569865557e-07, "loss": 0.6636, "step": 4181 }, { "epoch": 0.843658096057646, "grad_norm": 0.5060998201370239, "learning_rate": 6.261711589333847e-07, "loss": 0.6498, "step": 4182 }, { "epoch": 0.8438598316138529, "grad_norm": 0.8295542001724243, "learning_rate": 6.245887290367752e-07, "loss": 0.6737, "step": 4183 }, { "epoch": 0.8440615671700599, "grad_norm": 0.34962189197540283, "learning_rate": 6.230081679726157e-07, "loss": 0.674, "step": 4184 }, { "epoch": 0.8442633027262669, "grad_norm": 0.47532859444618225, "learning_rate": 6.214294764160012e-07, "loss": 0.6495, "step": 4185 }, { "epoch": 0.8444650382824739, "grad_norm": 0.8541948199272156, "learning_rate": 6.198526550412232e-07, "loss": 0.7932, "step": 4186 }, { "epoch": 0.8446667738386809, "grad_norm": 0.4823712408542633, "learning_rate": 6.182777045217764e-07, "loss": 0.7874, "step": 4187 }, { "epoch": 0.8448685093948879, "grad_norm": 0.4878910779953003, "learning_rate": 6.167046255303543e-07, "loss": 0.7106, "step": 4188 }, { "epoch": 0.8450702449510948, "grad_norm": 1.4342825412750244, "learning_rate": 6.151334187388552e-07, "loss": 0.6694, "step": 4189 }, { "epoch": 0.8452719805073019, "grad_norm": 0.973486602306366, "learning_rate": 6.13564084818375e-07, "loss": 0.7091, "step": 4190 }, { "epoch": 0.8454737160635089, "grad_norm": 1.177097201347351, "learning_rate": 6.119966244392084e-07, "loss": 0.6511, "step": 4191 }, { "epoch": 0.8456754516197159, "grad_norm": 0.44476157426834106, "learning_rate": 6.104310382708539e-07, "loss": 0.7403, "step": 4192 }, { "epoch": 0.8458771871759229, "grad_norm": 0.37803569436073303, "learning_rate": 6.088673269820061e-07, "loss": 0.6753, "step": 4193 }, { "epoch": 0.8460789227321298, "grad_norm": 0.6805874109268188, "learning_rate": 6.073054912405591e-07, "loss": 0.8085, "step": 4194 }, { "epoch": 0.8462806582883369, "grad_norm": 0.6585365533828735, "learning_rate": 6.057455317136063e-07, "loss": 0.8038, "step": 4195 }, { "epoch": 0.8464823938445438, "grad_norm": 0.34971868991851807, "learning_rate": 6.041874490674416e-07, "loss": 0.649, "step": 4196 }, { "epoch": 0.8466841294007509, "grad_norm": 0.9895550012588501, "learning_rate": 6.026312439675553e-07, "loss": 0.8025, "step": 4197 }, { "epoch": 0.8468858649569578, "grad_norm": 0.4490432143211365, "learning_rate": 6.010769170786351e-07, "loss": 0.6689, "step": 4198 }, { "epoch": 0.8470876005131648, "grad_norm": 0.3404206335544586, "learning_rate": 5.995244690645679e-07, "loss": 0.6602, "step": 4199 }, { "epoch": 0.8472893360693718, "grad_norm": 0.5209308862686157, "learning_rate": 5.979739005884382e-07, "loss": 0.6826, "step": 4200 }, { "epoch": 0.8474910716255788, "grad_norm": 2.569395065307617, "learning_rate": 5.964252123125275e-07, "loss": 0.6958, "step": 4201 }, { "epoch": 0.8476928071817857, "grad_norm": 0.3720131814479828, "learning_rate": 5.948784048983125e-07, "loss": 0.6304, "step": 4202 }, { "epoch": 0.8478945427379928, "grad_norm": 0.30389517545700073, "learning_rate": 5.933334790064698e-07, "loss": 0.6829, "step": 4203 }, { "epoch": 0.8480962782941998, "grad_norm": 0.4956578314304352, "learning_rate": 5.917904352968695e-07, "loss": 0.7114, "step": 4204 }, { "epoch": 0.8482980138504068, "grad_norm": 0.576766848564148, "learning_rate": 5.902492744285776e-07, "loss": 0.6591, "step": 4205 }, { "epoch": 0.8484997494066138, "grad_norm": 0.4476238191127777, "learning_rate": 5.887099970598614e-07, "loss": 0.7116, "step": 4206 }, { "epoch": 0.8487014849628207, "grad_norm": 1.312357783317566, "learning_rate": 5.87172603848174e-07, "loss": 0.6716, "step": 4207 }, { "epoch": 0.8489032205190278, "grad_norm": 0.3982757329940796, "learning_rate": 5.856370954501722e-07, "loss": 0.6598, "step": 4208 }, { "epoch": 0.8491049560752347, "grad_norm": 0.3557843565940857, "learning_rate": 5.841034725217049e-07, "loss": 0.7635, "step": 4209 }, { "epoch": 0.8493066916314418, "grad_norm": 0.5288363099098206, "learning_rate": 5.82571735717814e-07, "loss": 0.7273, "step": 4210 }, { "epoch": 0.8495084271876487, "grad_norm": 0.38838717341423035, "learning_rate": 5.810418856927385e-07, "loss": 0.6438, "step": 4211 }, { "epoch": 0.8497101627438557, "grad_norm": 0.481218546628952, "learning_rate": 5.795139230999103e-07, "loss": 0.8037, "step": 4212 }, { "epoch": 0.8499118983000628, "grad_norm": 0.5310901999473572, "learning_rate": 5.779878485919538e-07, "loss": 0.7274, "step": 4213 }, { "epoch": 0.8501136338562697, "grad_norm": 0.46764230728149414, "learning_rate": 5.76463662820691e-07, "loss": 0.8457, "step": 4214 }, { "epoch": 0.8503153694124768, "grad_norm": 0.7696524858474731, "learning_rate": 5.749413664371312e-07, "loss": 0.7506, "step": 4215 }, { "epoch": 0.8505171049686837, "grad_norm": 0.5245659947395325, "learning_rate": 5.734209600914814e-07, "loss": 0.6286, "step": 4216 }, { "epoch": 0.8507188405248907, "grad_norm": 0.3059857487678528, "learning_rate": 5.719024444331422e-07, "loss": 0.7241, "step": 4217 }, { "epoch": 0.8509205760810977, "grad_norm": 0.43941038846969604, "learning_rate": 5.703858201107004e-07, "loss": 0.6359, "step": 4218 }, { "epoch": 0.8511223116373047, "grad_norm": 0.39626121520996094, "learning_rate": 5.688710877719417e-07, "loss": 0.6723, "step": 4219 }, { "epoch": 0.8513240471935117, "grad_norm": 0.630386233329773, "learning_rate": 5.673582480638395e-07, "loss": 0.7543, "step": 4220 }, { "epoch": 0.8515257827497187, "grad_norm": 0.34314990043640137, "learning_rate": 5.658473016325605e-07, "loss": 0.702, "step": 4221 }, { "epoch": 0.8517275183059256, "grad_norm": 0.5030164122581482, "learning_rate": 5.643382491234645e-07, "loss": 0.9247, "step": 4222 }, { "epoch": 0.8519292538621327, "grad_norm": 0.32690751552581787, "learning_rate": 5.628310911810969e-07, "loss": 0.6695, "step": 4223 }, { "epoch": 0.8521309894183396, "grad_norm": 0.3914249539375305, "learning_rate": 5.613258284491984e-07, "loss": 0.6439, "step": 4224 }, { "epoch": 0.8523327249745466, "grad_norm": 0.49372297525405884, "learning_rate": 5.598224615707026e-07, "loss": 0.6305, "step": 4225 }, { "epoch": 0.8525344605307537, "grad_norm": 0.5601586699485779, "learning_rate": 5.583209911877247e-07, "loss": 0.6901, "step": 4226 }, { "epoch": 0.8527361960869606, "grad_norm": 0.6586346626281738, "learning_rate": 5.568214179415787e-07, "loss": 0.651, "step": 4227 }, { "epoch": 0.8529379316431677, "grad_norm": 0.4916522204875946, "learning_rate": 5.553237424727631e-07, "loss": 0.6494, "step": 4228 }, { "epoch": 0.8531396671993746, "grad_norm": 1.484217643737793, "learning_rate": 5.538279654209666e-07, "loss": 0.6313, "step": 4229 }, { "epoch": 0.8533414027555816, "grad_norm": 0.574791431427002, "learning_rate": 5.523340874250704e-07, "loss": 0.6747, "step": 4230 }, { "epoch": 0.8535431383117886, "grad_norm": 0.5068508982658386, "learning_rate": 5.508421091231403e-07, "loss": 0.722, "step": 4231 }, { "epoch": 0.8537448738679956, "grad_norm": 0.5957054495811462, "learning_rate": 5.493520311524315e-07, "loss": 0.645, "step": 4232 }, { "epoch": 0.8539466094242026, "grad_norm": 0.31677567958831787, "learning_rate": 5.478638541493903e-07, "loss": 0.7713, "step": 4233 }, { "epoch": 0.8541483449804096, "grad_norm": 0.36994820833206177, "learning_rate": 5.463775787496484e-07, "loss": 0.6373, "step": 4234 }, { "epoch": 0.8543500805366165, "grad_norm": 0.35878241062164307, "learning_rate": 5.448932055880262e-07, "loss": 0.6331, "step": 4235 }, { "epoch": 0.8545518160928236, "grad_norm": 0.4238692820072174, "learning_rate": 5.434107352985313e-07, "loss": 0.6168, "step": 4236 }, { "epoch": 0.8547535516490306, "grad_norm": 0.4130517542362213, "learning_rate": 5.41930168514358e-07, "loss": 0.6304, "step": 4237 }, { "epoch": 0.8549552872052376, "grad_norm": 0.5014340877532959, "learning_rate": 5.404515058678894e-07, "loss": 0.7457, "step": 4238 }, { "epoch": 0.8551570227614446, "grad_norm": 0.610929548740387, "learning_rate": 5.389747479906943e-07, "loss": 0.6477, "step": 4239 }, { "epoch": 0.8553587583176515, "grad_norm": 0.364946186542511, "learning_rate": 5.374998955135258e-07, "loss": 0.6872, "step": 4240 }, { "epoch": 0.8555604938738586, "grad_norm": 0.39241814613342285, "learning_rate": 5.360269490663278e-07, "loss": 0.6155, "step": 4241 }, { "epoch": 0.8557622294300655, "grad_norm": 0.34818553924560547, "learning_rate": 5.345559092782266e-07, "loss": 0.6519, "step": 4242 }, { "epoch": 0.8559639649862725, "grad_norm": 0.3692716062068939, "learning_rate": 5.330867767775333e-07, "loss": 0.6915, "step": 4243 }, { "epoch": 0.8561657005424795, "grad_norm": 0.3212972581386566, "learning_rate": 5.316195521917484e-07, "loss": 0.7803, "step": 4244 }, { "epoch": 0.8563674360986865, "grad_norm": 0.43347883224487305, "learning_rate": 5.301542361475548e-07, "loss": 0.6335, "step": 4245 }, { "epoch": 0.8565691716548935, "grad_norm": 0.6067093014717102, "learning_rate": 5.286908292708198e-07, "loss": 0.6424, "step": 4246 }, { "epoch": 0.8567709072111005, "grad_norm": 0.6528324484825134, "learning_rate": 5.272293321865951e-07, "loss": 0.6596, "step": 4247 }, { "epoch": 0.8569726427673074, "grad_norm": 0.5397018790245056, "learning_rate": 5.257697455191197e-07, "loss": 0.6386, "step": 4248 }, { "epoch": 0.8571743783235145, "grad_norm": 0.4248422384262085, "learning_rate": 5.243120698918136e-07, "loss": 0.8655, "step": 4249 }, { "epoch": 0.8573761138797215, "grad_norm": 0.33575424551963806, "learning_rate": 5.228563059272812e-07, "loss": 0.7742, "step": 4250 }, { "epoch": 0.8575778494359285, "grad_norm": 0.5652782320976257, "learning_rate": 5.2140245424731e-07, "loss": 0.6833, "step": 4251 }, { "epoch": 0.8577795849921355, "grad_norm": 0.41048964858055115, "learning_rate": 5.199505154728729e-07, "loss": 0.6791, "step": 4252 }, { "epoch": 0.8579813205483424, "grad_norm": 0.7275264859199524, "learning_rate": 5.185004902241241e-07, "loss": 0.6563, "step": 4253 }, { "epoch": 0.8581830561045495, "grad_norm": 0.5408958196640015, "learning_rate": 5.170523791204002e-07, "loss": 0.6719, "step": 4254 }, { "epoch": 0.8583847916607564, "grad_norm": 0.3260270357131958, "learning_rate": 5.156061827802195e-07, "loss": 0.8123, "step": 4255 }, { "epoch": 0.8585865272169635, "grad_norm": 0.5259901285171509, "learning_rate": 5.141619018212851e-07, "loss": 0.656, "step": 4256 }, { "epoch": 0.8587882627731704, "grad_norm": 0.3833652436733246, "learning_rate": 5.127195368604809e-07, "loss": 0.6741, "step": 4257 }, { "epoch": 0.8589899983293774, "grad_norm": 0.8140313029289246, "learning_rate": 5.112790885138703e-07, "loss": 0.8541, "step": 4258 }, { "epoch": 0.8591917338855845, "grad_norm": 0.42131373286247253, "learning_rate": 5.098405573967013e-07, "loss": 0.677, "step": 4259 }, { "epoch": 0.8593934694417914, "grad_norm": 0.4936279356479645, "learning_rate": 5.084039441234013e-07, "loss": 0.7666, "step": 4260 }, { "epoch": 0.8595952049979984, "grad_norm": 0.3809494376182556, "learning_rate": 5.069692493075778e-07, "loss": 0.645, "step": 4261 }, { "epoch": 0.8597969405542054, "grad_norm": 0.7333827018737793, "learning_rate": 5.055364735620222e-07, "loss": 0.6462, "step": 4262 }, { "epoch": 0.8599986761104124, "grad_norm": 0.3328392207622528, "learning_rate": 5.041056174987008e-07, "loss": 0.6537, "step": 4263 }, { "epoch": 0.8602004116666194, "grad_norm": 0.4985945522785187, "learning_rate": 5.026766817287654e-07, "loss": 0.6892, "step": 4264 }, { "epoch": 0.8604021472228264, "grad_norm": 0.6473126411437988, "learning_rate": 5.012496668625444e-07, "loss": 0.7757, "step": 4265 }, { "epoch": 0.8606038827790333, "grad_norm": 0.5193613767623901, "learning_rate": 4.998245735095459e-07, "loss": 0.6941, "step": 4266 }, { "epoch": 0.8608056183352404, "grad_norm": 0.5102809071540833, "learning_rate": 4.984014022784595e-07, "loss": 0.8465, "step": 4267 }, { "epoch": 0.8610073538914473, "grad_norm": 0.4982369542121887, "learning_rate": 4.969801537771512e-07, "loss": 0.717, "step": 4268 }, { "epoch": 0.8612090894476544, "grad_norm": 0.6398607492446899, "learning_rate": 4.955608286126673e-07, "loss": 0.6681, "step": 4269 }, { "epoch": 0.8614108250038613, "grad_norm": 0.37193191051483154, "learning_rate": 4.941434273912321e-07, "loss": 0.7803, "step": 4270 }, { "epoch": 0.8616125605600683, "grad_norm": 0.31802332401275635, "learning_rate": 4.927279507182486e-07, "loss": 0.6695, "step": 4271 }, { "epoch": 0.8618142961162754, "grad_norm": 0.5764533877372742, "learning_rate": 4.91314399198296e-07, "loss": 0.6397, "step": 4272 }, { "epoch": 0.8620160316724823, "grad_norm": 0.360847532749176, "learning_rate": 4.899027734351358e-07, "loss": 0.719, "step": 4273 }, { "epoch": 0.8622177672286894, "grad_norm": 0.7284826636314392, "learning_rate": 4.88493074031699e-07, "loss": 0.8214, "step": 4274 }, { "epoch": 0.8624195027848963, "grad_norm": 0.44335803389549255, "learning_rate": 4.870853015901028e-07, "loss": 1.1041, "step": 4275 }, { "epoch": 0.8626212383411033, "grad_norm": 0.5888170599937439, "learning_rate": 4.856794567116352e-07, "loss": 0.6641, "step": 4276 }, { "epoch": 0.8628229738973103, "grad_norm": 0.48649922013282776, "learning_rate": 4.842755399967625e-07, "loss": 0.6491, "step": 4277 }, { "epoch": 0.8630247094535173, "grad_norm": 0.9031432271003723, "learning_rate": 4.828735520451294e-07, "loss": 0.6338, "step": 4278 }, { "epoch": 0.8632264450097242, "grad_norm": 0.7475653886795044, "learning_rate": 4.814734934555543e-07, "loss": 0.7146, "step": 4279 }, { "epoch": 0.8634281805659313, "grad_norm": 0.7083207368850708, "learning_rate": 4.800753648260309e-07, "loss": 0.6607, "step": 4280 }, { "epoch": 0.8636299161221382, "grad_norm": 0.5013923645019531, "learning_rate": 4.786791667537338e-07, "loss": 0.6543, "step": 4281 }, { "epoch": 0.8638316516783453, "grad_norm": 0.6404758095741272, "learning_rate": 4.772848998350049e-07, "loss": 0.7574, "step": 4282 }, { "epoch": 0.8640333872345523, "grad_norm": 1.0840024948120117, "learning_rate": 4.7589256466536835e-07, "loss": 0.6509, "step": 4283 }, { "epoch": 0.8642351227907592, "grad_norm": 0.3333173990249634, "learning_rate": 4.7450216183952127e-07, "loss": 0.7079, "step": 4284 }, { "epoch": 0.8644368583469663, "grad_norm": 0.4083701968193054, "learning_rate": 4.7311369195133127e-07, "loss": 0.6664, "step": 4285 }, { "epoch": 0.8646385939031732, "grad_norm": 0.37062057852745056, "learning_rate": 4.717271555938474e-07, "loss": 0.7771, "step": 4286 }, { "epoch": 0.8648403294593803, "grad_norm": 0.8348171710968018, "learning_rate": 4.7034255335928704e-07, "loss": 0.6784, "step": 4287 }, { "epoch": 0.8650420650155872, "grad_norm": 0.3087136745452881, "learning_rate": 4.689598858390432e-07, "loss": 0.6582, "step": 4288 }, { "epoch": 0.8652438005717942, "grad_norm": 0.5123614072799683, "learning_rate": 4.6757915362368567e-07, "loss": 0.6496, "step": 4289 }, { "epoch": 0.8654455361280012, "grad_norm": 0.5696089863777161, "learning_rate": 4.6620035730295277e-07, "loss": 0.6654, "step": 4290 }, { "epoch": 0.8656472716842082, "grad_norm": 0.4323718845844269, "learning_rate": 4.6482349746575783e-07, "loss": 0.6986, "step": 4291 }, { "epoch": 0.8658490072404152, "grad_norm": 0.9498506188392639, "learning_rate": 4.634485747001899e-07, "loss": 0.6253, "step": 4292 }, { "epoch": 0.8660507427966222, "grad_norm": 0.4021576941013336, "learning_rate": 4.620755895935042e-07, "loss": 0.6582, "step": 4293 }, { "epoch": 0.8662524783528291, "grad_norm": 0.41058310866355896, "learning_rate": 4.6070454273213605e-07, "loss": 0.6798, "step": 4294 }, { "epoch": 0.8664542139090362, "grad_norm": 0.7499169707298279, "learning_rate": 4.5933543470168706e-07, "loss": 0.6811, "step": 4295 }, { "epoch": 0.8666559494652432, "grad_norm": 0.6942950487136841, "learning_rate": 4.5796826608693277e-07, "loss": 0.6765, "step": 4296 }, { "epoch": 0.8668576850214501, "grad_norm": 0.511210560798645, "learning_rate": 4.566030374718211e-07, "loss": 0.8112, "step": 4297 }, { "epoch": 0.8670594205776572, "grad_norm": 0.3508337140083313, "learning_rate": 4.552397494394706e-07, "loss": 0.7067, "step": 4298 }, { "epoch": 0.8672611561338641, "grad_norm": 1.2191749811172485, "learning_rate": 4.5387840257216987e-07, "loss": 0.7768, "step": 4299 }, { "epoch": 0.8674628916900712, "grad_norm": 0.6152402758598328, "learning_rate": 4.5251899745138104e-07, "loss": 0.6522, "step": 4300 }, { "epoch": 0.8676646272462781, "grad_norm": 0.3791225254535675, "learning_rate": 4.5116153465773525e-07, "loss": 0.658, "step": 4301 }, { "epoch": 0.8678663628024851, "grad_norm": 0.7504114508628845, "learning_rate": 4.4980601477103257e-07, "loss": 0.6723, "step": 4302 }, { "epoch": 0.8680680983586921, "grad_norm": 0.520866870880127, "learning_rate": 4.4845243837024543e-07, "loss": 0.6935, "step": 4303 }, { "epoch": 0.8682698339148991, "grad_norm": 0.8700990676879883, "learning_rate": 4.4710080603351634e-07, "loss": 0.6538, "step": 4304 }, { "epoch": 0.8684715694711062, "grad_norm": 0.39416974782943726, "learning_rate": 4.457511183381563e-07, "loss": 0.7885, "step": 4305 }, { "epoch": 0.8686733050273131, "grad_norm": 0.7392798662185669, "learning_rate": 4.444033758606453e-07, "loss": 0.6649, "step": 4306 }, { "epoch": 0.86887504058352, "grad_norm": 0.6805448532104492, "learning_rate": 4.4305757917663284e-07, "loss": 0.8631, "step": 4307 }, { "epoch": 0.8690767761397271, "grad_norm": 0.7489762902259827, "learning_rate": 4.4171372886093967e-07, "loss": 0.7059, "step": 4308 }, { "epoch": 0.8692785116959341, "grad_norm": 0.5760902762413025, "learning_rate": 4.4037182548755166e-07, "loss": 0.7396, "step": 4309 }, { "epoch": 0.8694802472521411, "grad_norm": 0.8316147327423096, "learning_rate": 4.390318696296247e-07, "loss": 0.6357, "step": 4310 }, { "epoch": 0.8696819828083481, "grad_norm": 0.516345739364624, "learning_rate": 4.376938618594828e-07, "loss": 0.6736, "step": 4311 }, { "epoch": 0.869883718364555, "grad_norm": 1.7527180910110474, "learning_rate": 4.363578027486187e-07, "loss": 0.6342, "step": 4312 }, { "epoch": 0.8700854539207621, "grad_norm": 0.6879702806472778, "learning_rate": 4.3502369286769154e-07, "loss": 0.6328, "step": 4313 }, { "epoch": 0.870287189476969, "grad_norm": 0.8283377289772034, "learning_rate": 4.3369153278652765e-07, "loss": 0.6603, "step": 4314 }, { "epoch": 0.8704889250331761, "grad_norm": 1.3900506496429443, "learning_rate": 4.323613230741236e-07, "loss": 0.8108, "step": 4315 }, { "epoch": 0.870690660589383, "grad_norm": 0.36297035217285156, "learning_rate": 4.310330642986382e-07, "loss": 0.7027, "step": 4316 }, { "epoch": 0.87089239614559, "grad_norm": 0.6657307744026184, "learning_rate": 4.2970675702739997e-07, "loss": 0.6216, "step": 4317 }, { "epoch": 0.8710941317017971, "grad_norm": 0.40114277601242065, "learning_rate": 4.283824018269045e-07, "loss": 0.878, "step": 4318 }, { "epoch": 0.871295867258004, "grad_norm": 0.3651154041290283, "learning_rate": 4.270599992628116e-07, "loss": 0.678, "step": 4319 }, { "epoch": 0.871497602814211, "grad_norm": 0.4124424457550049, "learning_rate": 4.257395498999478e-07, "loss": 0.6754, "step": 4320 }, { "epoch": 0.871699338370418, "grad_norm": 0.42094066739082336, "learning_rate": 4.244210543023053e-07, "loss": 0.6665, "step": 4321 }, { "epoch": 0.871901073926625, "grad_norm": 0.35365965962409973, "learning_rate": 4.231045130330419e-07, "loss": 0.7355, "step": 4322 }, { "epoch": 0.872102809482832, "grad_norm": 0.3624404966831207, "learning_rate": 4.2178992665448226e-07, "loss": 0.7504, "step": 4323 }, { "epoch": 0.872304545039039, "grad_norm": 0.6722658276557922, "learning_rate": 4.204772957281128e-07, "loss": 0.7163, "step": 4324 }, { "epoch": 0.8725062805952459, "grad_norm": 0.3923650085926056, "learning_rate": 4.191666208145867e-07, "loss": 0.6459, "step": 4325 }, { "epoch": 0.872708016151453, "grad_norm": 0.42511051893234253, "learning_rate": 4.1785790247372226e-07, "loss": 0.7555, "step": 4326 }, { "epoch": 0.8729097517076599, "grad_norm": 0.4749056398868561, "learning_rate": 4.1655114126450125e-07, "loss": 0.661, "step": 4327 }, { "epoch": 0.873111487263867, "grad_norm": 0.31862133741378784, "learning_rate": 4.152463377450683e-07, "loss": 0.6398, "step": 4328 }, { "epoch": 0.873313222820074, "grad_norm": 0.48377200961112976, "learning_rate": 4.139434924727359e-07, "loss": 0.6006, "step": 4329 }, { "epoch": 0.8735149583762809, "grad_norm": 0.43817344307899475, "learning_rate": 4.1264260600397343e-07, "loss": 0.6488, "step": 4330 }, { "epoch": 0.873716693932488, "grad_norm": 0.8320906758308411, "learning_rate": 4.113436788944197e-07, "loss": 0.6923, "step": 4331 }, { "epoch": 0.8739184294886949, "grad_norm": 0.6215797066688538, "learning_rate": 4.10046711698876e-07, "loss": 0.7747, "step": 4332 }, { "epoch": 0.874120165044902, "grad_norm": 0.45883363485336304, "learning_rate": 4.0875170497130135e-07, "loss": 0.697, "step": 4333 }, { "epoch": 0.8743219006011089, "grad_norm": 0.4302061200141907, "learning_rate": 4.074586592648244e-07, "loss": 0.6485, "step": 4334 }, { "epoch": 0.8745236361573159, "grad_norm": 0.4158835709095001, "learning_rate": 4.0616757513173123e-07, "loss": 0.6602, "step": 4335 }, { "epoch": 0.8747253717135229, "grad_norm": 0.5009157061576843, "learning_rate": 4.048784531234706e-07, "loss": 0.6721, "step": 4336 }, { "epoch": 0.8749271072697299, "grad_norm": 0.37360620498657227, "learning_rate": 4.035912937906578e-07, "loss": 0.6843, "step": 4337 }, { "epoch": 0.8751288428259368, "grad_norm": 0.447135865688324, "learning_rate": 4.023060976830623e-07, "loss": 0.6044, "step": 4338 }, { "epoch": 0.8753305783821439, "grad_norm": 0.336240291595459, "learning_rate": 4.010228653496207e-07, "loss": 0.8823, "step": 4339 }, { "epoch": 0.8755323139383508, "grad_norm": 0.338085800409317, "learning_rate": 3.997415973384311e-07, "loss": 0.6601, "step": 4340 }, { "epoch": 0.8757340494945579, "grad_norm": 0.6273344159126282, "learning_rate": 3.9846229419674754e-07, "loss": 0.6359, "step": 4341 }, { "epoch": 0.8759357850507649, "grad_norm": 0.42052680253982544, "learning_rate": 3.9718495647099007e-07, "loss": 0.7733, "step": 4342 }, { "epoch": 0.8761375206069718, "grad_norm": 0.373045951128006, "learning_rate": 3.9590958470673626e-07, "loss": 0.7387, "step": 4343 }, { "epoch": 0.8763392561631789, "grad_norm": 0.926173985004425, "learning_rate": 3.9463617944872465e-07, "loss": 0.6867, "step": 4344 }, { "epoch": 0.8765409917193858, "grad_norm": 0.5256276726722717, "learning_rate": 3.933647412408548e-07, "loss": 0.6511, "step": 4345 }, { "epoch": 0.8767427272755929, "grad_norm": 0.8077823519706726, "learning_rate": 3.920952706261855e-07, "loss": 0.7477, "step": 4346 }, { "epoch": 0.8769444628317998, "grad_norm": 0.567271888256073, "learning_rate": 3.9082776814693355e-07, "loss": 0.798, "step": 4347 }, { "epoch": 0.8771461983880068, "grad_norm": 0.3140374720096588, "learning_rate": 3.8956223434447936e-07, "loss": 0.7335, "step": 4348 }, { "epoch": 0.8773479339442138, "grad_norm": 0.9564724564552307, "learning_rate": 3.8829866975935603e-07, "loss": 0.7208, "step": 4349 }, { "epoch": 0.8775496695004208, "grad_norm": 0.6215482354164124, "learning_rate": 3.870370749312624e-07, "loss": 0.7061, "step": 4350 }, { "epoch": 0.8777514050566279, "grad_norm": 0.33347561955451965, "learning_rate": 3.857774503990513e-07, "loss": 0.7846, "step": 4351 }, { "epoch": 0.8779531406128348, "grad_norm": 1.3723175525665283, "learning_rate": 3.845197967007347e-07, "loss": 0.656, "step": 4352 }, { "epoch": 0.8781548761690418, "grad_norm": 0.5448706746101379, "learning_rate": 3.832641143734861e-07, "loss": 0.6686, "step": 4353 }, { "epoch": 0.8783566117252488, "grad_norm": 0.7323155999183655, "learning_rate": 3.820104039536326e-07, "loss": 0.6493, "step": 4354 }, { "epoch": 0.8785583472814558, "grad_norm": 1.221942663192749, "learning_rate": 3.8075866597666044e-07, "loss": 0.7042, "step": 4355 }, { "epoch": 0.8787600828376627, "grad_norm": 0.4284684360027313, "learning_rate": 3.795089009772157e-07, "loss": 0.7094, "step": 4356 }, { "epoch": 0.8789618183938698, "grad_norm": 0.5355064272880554, "learning_rate": 3.782611094890992e-07, "loss": 0.646, "step": 4357 }, { "epoch": 0.8791635539500767, "grad_norm": 0.4629518687725067, "learning_rate": 3.7701529204526856e-07, "loss": 0.6725, "step": 4358 }, { "epoch": 0.8793652895062838, "grad_norm": 0.356670618057251, "learning_rate": 3.757714491778419e-07, "loss": 0.6576, "step": 4359 }, { "epoch": 0.8795670250624907, "grad_norm": 0.5200281143188477, "learning_rate": 3.745295814180877e-07, "loss": 0.6276, "step": 4360 }, { "epoch": 0.8797687606186977, "grad_norm": 0.5747634768486023, "learning_rate": 3.7328968929643714e-07, "loss": 0.7729, "step": 4361 }, { "epoch": 0.8799704961749047, "grad_norm": 0.41098442673683167, "learning_rate": 3.7205177334247445e-07, "loss": 0.682, "step": 4362 }, { "epoch": 0.8801722317311117, "grad_norm": 0.3811708986759186, "learning_rate": 3.7081583408493883e-07, "loss": 0.7493, "step": 4363 }, { "epoch": 0.8803739672873188, "grad_norm": 0.5595073103904724, "learning_rate": 3.69581872051728e-07, "loss": 0.793, "step": 4364 }, { "epoch": 0.8805757028435257, "grad_norm": 0.410376638174057, "learning_rate": 3.6834988776989323e-07, "loss": 0.6694, "step": 4365 }, { "epoch": 0.8807774383997327, "grad_norm": 0.36430269479751587, "learning_rate": 3.671198817656413e-07, "loss": 0.6765, "step": 4366 }, { "epoch": 0.8809791739559397, "grad_norm": 0.6053842902183533, "learning_rate": 3.658918545643353e-07, "loss": 0.8619, "step": 4367 }, { "epoch": 0.8811809095121467, "grad_norm": 0.6663161516189575, "learning_rate": 3.6466580669049123e-07, "loss": 0.7408, "step": 4368 }, { "epoch": 0.8813826450683537, "grad_norm": 0.41885098814964294, "learning_rate": 3.6344173866778075e-07, "loss": 0.6584, "step": 4369 }, { "epoch": 0.8815843806245607, "grad_norm": 0.5261691808700562, "learning_rate": 3.62219651019029e-07, "loss": 0.6922, "step": 4370 }, { "epoch": 0.8817861161807676, "grad_norm": 0.4592227339744568, "learning_rate": 3.609995442662173e-07, "loss": 0.679, "step": 4371 }, { "epoch": 0.8819878517369747, "grad_norm": 0.4438103139400482, "learning_rate": 3.597814189304788e-07, "loss": 0.7043, "step": 4372 }, { "epoch": 0.8821895872931816, "grad_norm": 0.5406025052070618, "learning_rate": 3.585652755321012e-07, "loss": 0.733, "step": 4373 }, { "epoch": 0.8823913228493886, "grad_norm": 0.3534887731075287, "learning_rate": 3.573511145905245e-07, "loss": 0.6839, "step": 4374 }, { "epoch": 0.8825930584055957, "grad_norm": 0.4145587980747223, "learning_rate": 3.561389366243451e-07, "loss": 0.7142, "step": 4375 }, { "epoch": 0.8827947939618026, "grad_norm": 0.5214899182319641, "learning_rate": 3.5492874215130926e-07, "loss": 0.766, "step": 4376 }, { "epoch": 0.8829965295180097, "grad_norm": 0.38252052664756775, "learning_rate": 3.5372053168831744e-07, "loss": 0.6699, "step": 4377 }, { "epoch": 0.8831982650742166, "grad_norm": 0.5145108699798584, "learning_rate": 3.5251430575142074e-07, "loss": 0.6906, "step": 4378 }, { "epoch": 0.8834000006304236, "grad_norm": 0.27400287985801697, "learning_rate": 3.5131006485582653e-07, "loss": 0.6277, "step": 4379 }, { "epoch": 0.8836017361866306, "grad_norm": 0.4707011878490448, "learning_rate": 3.501078095158911e-07, "loss": 0.6461, "step": 4380 }, { "epoch": 0.8838034717428376, "grad_norm": 0.3479291498661041, "learning_rate": 3.4890754024512254e-07, "loss": 0.7901, "step": 4381 }, { "epoch": 0.8840052072990446, "grad_norm": 0.6620512008666992, "learning_rate": 3.477092575561836e-07, "loss": 0.6257, "step": 4382 }, { "epoch": 0.8842069428552516, "grad_norm": 0.3701179623603821, "learning_rate": 3.465129619608859e-07, "loss": 0.6799, "step": 4383 }, { "epoch": 0.8844086784114585, "grad_norm": 0.876181423664093, "learning_rate": 3.453186539701925e-07, "loss": 0.719, "step": 4384 }, { "epoch": 0.8846104139676656, "grad_norm": 0.9063430428504944, "learning_rate": 3.441263340942197e-07, "loss": 0.6176, "step": 4385 }, { "epoch": 0.8848121495238725, "grad_norm": 0.4452477693557739, "learning_rate": 3.429360028422307e-07, "loss": 0.6694, "step": 4386 }, { "epoch": 0.8850138850800796, "grad_norm": 0.3298512399196625, "learning_rate": 3.4174766072264333e-07, "loss": 0.6696, "step": 4387 }, { "epoch": 0.8852156206362866, "grad_norm": 0.8097718358039856, "learning_rate": 3.405613082430237e-07, "loss": 0.6662, "step": 4388 }, { "epoch": 0.8854173561924935, "grad_norm": 0.39331236481666565, "learning_rate": 3.393769459100876e-07, "loss": 0.7255, "step": 4389 }, { "epoch": 0.8856190917487006, "grad_norm": 0.33854588866233826, "learning_rate": 3.3819457422970327e-07, "loss": 0.7015, "step": 4390 }, { "epoch": 0.8858208273049075, "grad_norm": 0.4164811074733734, "learning_rate": 3.3701419370688657e-07, "loss": 0.6659, "step": 4391 }, { "epoch": 0.8860225628611145, "grad_norm": 0.40010184049606323, "learning_rate": 3.3583580484580215e-07, "loss": 0.779, "step": 4392 }, { "epoch": 0.8862242984173215, "grad_norm": 0.3427859842777252, "learning_rate": 3.3465940814976784e-07, "loss": 0.6434, "step": 4393 }, { "epoch": 0.8864260339735285, "grad_norm": 0.4118961691856384, "learning_rate": 3.334850041212462e-07, "loss": 0.6414, "step": 4394 }, { "epoch": 0.8866277695297355, "grad_norm": 0.576214611530304, "learning_rate": 3.3231259326184983e-07, "loss": 0.7513, "step": 4395 }, { "epoch": 0.8868295050859425, "grad_norm": 0.45370420813560486, "learning_rate": 3.311421760723438e-07, "loss": 0.7095, "step": 4396 }, { "epoch": 0.8870312406421494, "grad_norm": 0.37329503893852234, "learning_rate": 3.299737530526348e-07, "loss": 0.6451, "step": 4397 }, { "epoch": 0.8872329761983565, "grad_norm": 0.4043911099433899, "learning_rate": 3.2880732470178366e-07, "loss": 0.6876, "step": 4398 }, { "epoch": 0.8874347117545635, "grad_norm": 0.4774644672870636, "learning_rate": 3.276428915179969e-07, "loss": 0.7071, "step": 4399 }, { "epoch": 0.8876364473107705, "grad_norm": 0.30243927240371704, "learning_rate": 3.264804539986283e-07, "loss": 0.6641, "step": 4400 }, { "epoch": 0.8878381828669775, "grad_norm": 0.7250840067863464, "learning_rate": 3.2532001264018067e-07, "loss": 0.8407, "step": 4401 }, { "epoch": 0.8880399184231844, "grad_norm": 1.1073092222213745, "learning_rate": 3.241615679383031e-07, "loss": 0.6304, "step": 4402 }, { "epoch": 0.8882416539793915, "grad_norm": 0.6125013828277588, "learning_rate": 3.2300512038779155e-07, "loss": 0.849, "step": 4403 }, { "epoch": 0.8884433895355984, "grad_norm": 0.3922661542892456, "learning_rate": 3.2185067048259245e-07, "loss": 0.7053, "step": 4404 }, { "epoch": 0.8886451250918055, "grad_norm": 0.5089098215103149, "learning_rate": 3.2069821871579255e-07, "loss": 0.7376, "step": 4405 }, { "epoch": 0.8888468606480124, "grad_norm": 4.080984592437744, "learning_rate": 3.1954776557963086e-07, "loss": 0.6724, "step": 4406 }, { "epoch": 0.8890485962042194, "grad_norm": 0.42470672726631165, "learning_rate": 3.183993115654921e-07, "loss": 0.8208, "step": 4407 }, { "epoch": 0.8892503317604264, "grad_norm": 0.38804891705513, "learning_rate": 3.172528571639022e-07, "loss": 0.6946, "step": 4408 }, { "epoch": 0.8894520673166334, "grad_norm": 0.7295994758605957, "learning_rate": 3.161084028645395e-07, "loss": 0.7716, "step": 4409 }, { "epoch": 0.8896538028728405, "grad_norm": 0.3722836971282959, "learning_rate": 3.1496594915622405e-07, "loss": 0.6619, "step": 4410 }, { "epoch": 0.8898555384290474, "grad_norm": 0.42691469192504883, "learning_rate": 3.1382549652692164e-07, "loss": 0.7169, "step": 4411 }, { "epoch": 0.8900572739852544, "grad_norm": 0.5380913615226746, "learning_rate": 3.126870454637454e-07, "loss": 0.6632, "step": 4412 }, { "epoch": 0.8902590095414614, "grad_norm": 0.4028385579586029, "learning_rate": 3.115505964529519e-07, "loss": 0.7287, "step": 4413 }, { "epoch": 0.8904607450976684, "grad_norm": 0.45696067810058594, "learning_rate": 3.1041614997994295e-07, "loss": 0.6113, "step": 4414 }, { "epoch": 0.8906624806538753, "grad_norm": 1.2479190826416016, "learning_rate": 3.0928370652926586e-07, "loss": 0.731, "step": 4415 }, { "epoch": 0.8908642162100824, "grad_norm": 0.767214298248291, "learning_rate": 3.0815326658460986e-07, "loss": 0.6698, "step": 4416 }, { "epoch": 0.8910659517662893, "grad_norm": 0.46267926692962646, "learning_rate": 3.0702483062881206e-07, "loss": 0.7059, "step": 4417 }, { "epoch": 0.8912676873224964, "grad_norm": 0.751915693283081, "learning_rate": 3.058983991438508e-07, "loss": 0.7794, "step": 4418 }, { "epoch": 0.8914694228787033, "grad_norm": 0.42991894483566284, "learning_rate": 3.047739726108484e-07, "loss": 0.6285, "step": 4419 }, { "epoch": 0.8916711584349103, "grad_norm": 0.40284815430641174, "learning_rate": 3.036515515100735e-07, "loss": 0.6686, "step": 4420 }, { "epoch": 0.8918728939911174, "grad_norm": 0.3892557919025421, "learning_rate": 3.02531136320936e-07, "loss": 0.6799, "step": 4421 }, { "epoch": 0.8920746295473243, "grad_norm": 0.5194017291069031, "learning_rate": 3.01412727521988e-07, "loss": 0.6733, "step": 4422 }, { "epoch": 0.8922763651035314, "grad_norm": 0.36624839901924133, "learning_rate": 3.0029632559092747e-07, "loss": 0.8639, "step": 4423 }, { "epoch": 0.8924781006597383, "grad_norm": 0.36247923970222473, "learning_rate": 2.991819310045929e-07, "loss": 0.638, "step": 4424 }, { "epoch": 0.8926798362159453, "grad_norm": 0.32094642519950867, "learning_rate": 2.9806954423896696e-07, "loss": 0.6623, "step": 4425 }, { "epoch": 0.8928815717721523, "grad_norm": 0.5981245040893555, "learning_rate": 2.9695916576917285e-07, "loss": 0.7036, "step": 4426 }, { "epoch": 0.8930833073283593, "grad_norm": 0.4451741874217987, "learning_rate": 2.9585079606947843e-07, "loss": 0.8021, "step": 4427 }, { "epoch": 0.8932850428845663, "grad_norm": 0.39480602741241455, "learning_rate": 2.947444356132917e-07, "loss": 0.8903, "step": 4428 }, { "epoch": 0.8934867784407733, "grad_norm": 0.6016244292259216, "learning_rate": 2.93640084873163e-07, "loss": 0.6515, "step": 4429 }, { "epoch": 0.8936885139969802, "grad_norm": 0.34783855080604553, "learning_rate": 2.9253774432078384e-07, "loss": 0.9107, "step": 4430 }, { "epoch": 0.8938902495531873, "grad_norm": 0.42876172065734863, "learning_rate": 2.914374144269888e-07, "loss": 0.6776, "step": 4431 }, { "epoch": 0.8940919851093942, "grad_norm": 0.909610390663147, "learning_rate": 2.903390956617519e-07, "loss": 0.7213, "step": 4432 }, { "epoch": 0.8942937206656012, "grad_norm": 1.040679693222046, "learning_rate": 2.8924278849418784e-07, "loss": 0.7602, "step": 4433 }, { "epoch": 0.8944954562218083, "grad_norm": 0.5264043211936951, "learning_rate": 2.881484933925549e-07, "loss": 0.649, "step": 4434 }, { "epoch": 0.8946971917780152, "grad_norm": 0.4022894501686096, "learning_rate": 2.870562108242486e-07, "loss": 0.7097, "step": 4435 }, { "epoch": 0.8948989273342223, "grad_norm": 0.41743919253349304, "learning_rate": 2.8596594125580745e-07, "loss": 0.6597, "step": 4436 }, { "epoch": 0.8951006628904292, "grad_norm": 0.8420581221580505, "learning_rate": 2.8487768515290783e-07, "loss": 0.6856, "step": 4437 }, { "epoch": 0.8953023984466362, "grad_norm": 0.7341222763061523, "learning_rate": 2.8379144298036845e-07, "loss": 0.774, "step": 4438 }, { "epoch": 0.8955041340028432, "grad_norm": 3.0229249000549316, "learning_rate": 2.827072152021465e-07, "loss": 0.6685, "step": 4439 }, { "epoch": 0.8957058695590502, "grad_norm": 0.44219347834587097, "learning_rate": 2.816250022813383e-07, "loss": 0.8718, "step": 4440 }, { "epoch": 0.8959076051152572, "grad_norm": 0.7286803722381592, "learning_rate": 2.8054480468018117e-07, "loss": 0.6332, "step": 4441 }, { "epoch": 0.8961093406714642, "grad_norm": 0.4334128797054291, "learning_rate": 2.7946662286005124e-07, "loss": 0.6492, "step": 4442 }, { "epoch": 0.8963110762276711, "grad_norm": 0.45339667797088623, "learning_rate": 2.783904572814622e-07, "loss": 0.7465, "step": 4443 }, { "epoch": 0.8965128117838782, "grad_norm": 0.5342560410499573, "learning_rate": 2.7731630840406754e-07, "loss": 0.6591, "step": 4444 }, { "epoch": 0.8967145473400852, "grad_norm": 0.3707166910171509, "learning_rate": 2.7624417668665917e-07, "loss": 0.6863, "step": 4445 }, { "epoch": 0.8969162828962922, "grad_norm": 0.4466363787651062, "learning_rate": 2.751740625871691e-07, "loss": 0.727, "step": 4446 }, { "epoch": 0.8971180184524992, "grad_norm": 0.5713307857513428, "learning_rate": 2.7410596656266497e-07, "loss": 0.6592, "step": 4447 }, { "epoch": 0.8973197540087061, "grad_norm": 0.4353967010974884, "learning_rate": 2.730398890693536e-07, "loss": 0.6307, "step": 4448 }, { "epoch": 0.8975214895649132, "grad_norm": 0.4011313021183014, "learning_rate": 2.7197583056258027e-07, "loss": 0.6981, "step": 4449 }, { "epoch": 0.8977232251211201, "grad_norm": 0.37026599049568176, "learning_rate": 2.7091379149682683e-07, "loss": 0.8362, "step": 4450 }, { "epoch": 0.8979249606773271, "grad_norm": 0.4338156580924988, "learning_rate": 2.698537723257127e-07, "loss": 0.7418, "step": 4451 }, { "epoch": 0.8981266962335341, "grad_norm": 0.539256751537323, "learning_rate": 2.687957735019969e-07, "loss": 0.6242, "step": 4452 }, { "epoch": 0.8983284317897411, "grad_norm": 0.4187043607234955, "learning_rate": 2.6773979547757013e-07, "loss": 0.6234, "step": 4453 }, { "epoch": 0.8985301673459481, "grad_norm": 0.34601306915283203, "learning_rate": 2.666858387034654e-07, "loss": 0.6453, "step": 4454 }, { "epoch": 0.8987319029021551, "grad_norm": 0.44681617617607117, "learning_rate": 2.656339036298522e-07, "loss": 0.6852, "step": 4455 }, { "epoch": 0.898933638458362, "grad_norm": 1.701403021812439, "learning_rate": 2.6458399070603047e-07, "loss": 0.6536, "step": 4456 }, { "epoch": 0.8991353740145691, "grad_norm": 0.5173128843307495, "learning_rate": 2.635361003804443e-07, "loss": 0.6447, "step": 4457 }, { "epoch": 0.8993371095707761, "grad_norm": 1.2008379697799683, "learning_rate": 2.6249023310066845e-07, "loss": 0.7079, "step": 4458 }, { "epoch": 0.8995388451269831, "grad_norm": 0.9323862195014954, "learning_rate": 2.6144638931341503e-07, "loss": 0.6737, "step": 4459 }, { "epoch": 0.8997405806831901, "grad_norm": 0.3327551484107971, "learning_rate": 2.604045694645341e-07, "loss": 0.6379, "step": 4460 }, { "epoch": 0.899942316239397, "grad_norm": 0.4596746265888214, "learning_rate": 2.593647739990068e-07, "loss": 0.6822, "step": 4461 }, { "epoch": 0.9001440517956041, "grad_norm": 0.6696921586990356, "learning_rate": 2.583270033609536e-07, "loss": 0.687, "step": 4462 }, { "epoch": 0.900345787351811, "grad_norm": 0.48210087418556213, "learning_rate": 2.572912579936304e-07, "loss": 0.7884, "step": 4463 }, { "epoch": 0.9005475229080181, "grad_norm": 0.8259105086326599, "learning_rate": 2.5625753833942337e-07, "loss": 0.78, "step": 4464 }, { "epoch": 0.900749258464225, "grad_norm": 0.40220457315444946, "learning_rate": 2.552258448398576e-07, "loss": 0.6503, "step": 4465 }, { "epoch": 0.900950994020432, "grad_norm": 0.4821997880935669, "learning_rate": 2.5419617793559224e-07, "loss": 0.6525, "step": 4466 }, { "epoch": 0.901152729576639, "grad_norm": 1.67778480052948, "learning_rate": 2.5316853806641895e-07, "loss": 0.6351, "step": 4467 }, { "epoch": 0.901354465132846, "grad_norm": 0.44449669122695923, "learning_rate": 2.521429256712665e-07, "loss": 0.6526, "step": 4468 }, { "epoch": 0.901556200689053, "grad_norm": 0.5264053344726562, "learning_rate": 2.5111934118819514e-07, "loss": 0.7608, "step": 4469 }, { "epoch": 0.90175793624526, "grad_norm": 0.47850438952445984, "learning_rate": 2.5009778505439895e-07, "loss": 0.6309, "step": 4470 }, { "epoch": 0.901959671801467, "grad_norm": 0.8120545744895935, "learning_rate": 2.49078257706209e-07, "loss": 0.6336, "step": 4471 }, { "epoch": 0.902161407357674, "grad_norm": 0.4928573966026306, "learning_rate": 2.480607595790846e-07, "loss": 0.7334, "step": 4472 }, { "epoch": 0.902363142913881, "grad_norm": 0.582403302192688, "learning_rate": 2.470452911076227e-07, "loss": 0.6507, "step": 4473 }, { "epoch": 0.9025648784700879, "grad_norm": 0.5206587910652161, "learning_rate": 2.460318527255523e-07, "loss": 0.6185, "step": 4474 }, { "epoch": 0.902766614026295, "grad_norm": 1.0327239036560059, "learning_rate": 2.450204448657328e-07, "loss": 0.6549, "step": 4475 }, { "epoch": 0.9029683495825019, "grad_norm": 0.4468756914138794, "learning_rate": 2.4401106796016037e-07, "loss": 0.6491, "step": 4476 }, { "epoch": 0.903170085138709, "grad_norm": 0.35880833864212036, "learning_rate": 2.430037224399606e-07, "loss": 0.6615, "step": 4477 }, { "epoch": 0.903371820694916, "grad_norm": 0.6757314801216125, "learning_rate": 2.4199840873539217e-07, "loss": 0.6449, "step": 4478 }, { "epoch": 0.9035735562511229, "grad_norm": 0.9163317084312439, "learning_rate": 2.409951272758471e-07, "loss": 0.6517, "step": 4479 }, { "epoch": 0.90377529180733, "grad_norm": 0.4744824171066284, "learning_rate": 2.399938784898481e-07, "loss": 0.6863, "step": 4480 }, { "epoch": 0.9039770273635369, "grad_norm": 0.36794766783714294, "learning_rate": 2.3899466280504936e-07, "loss": 0.6666, "step": 4481 }, { "epoch": 0.904178762919744, "grad_norm": 0.6534149646759033, "learning_rate": 2.3799748064823935e-07, "loss": 0.6995, "step": 4482 }, { "epoch": 0.9043804984759509, "grad_norm": 0.3607742190361023, "learning_rate": 2.3700233244533412e-07, "loss": 0.6806, "step": 4483 }, { "epoch": 0.9045822340321579, "grad_norm": 0.4232555627822876, "learning_rate": 2.3600921862138414e-07, "loss": 0.6792, "step": 4484 }, { "epoch": 0.9047839695883649, "grad_norm": 0.6024295687675476, "learning_rate": 2.3501813960056962e-07, "loss": 0.6654, "step": 4485 }, { "epoch": 0.9049857051445719, "grad_norm": 0.6922175288200378, "learning_rate": 2.3402909580620025e-07, "loss": 0.7664, "step": 4486 }, { "epoch": 0.9051874407007788, "grad_norm": 0.8140506148338318, "learning_rate": 2.330420876607198e-07, "loss": 0.7117, "step": 4487 }, { "epoch": 0.9053891762569859, "grad_norm": 0.7923047542572021, "learning_rate": 2.3205711558570043e-07, "loss": 0.6498, "step": 4488 }, { "epoch": 0.9055909118131928, "grad_norm": 0.9481064677238464, "learning_rate": 2.3107418000184345e-07, "loss": 0.668, "step": 4489 }, { "epoch": 0.9057926473693999, "grad_norm": 0.44208043813705444, "learning_rate": 2.3009328132898355e-07, "loss": 0.8772, "step": 4490 }, { "epoch": 0.9059943829256069, "grad_norm": 0.6147819757461548, "learning_rate": 2.2911441998608342e-07, "loss": 0.6579, "step": 4491 }, { "epoch": 0.9061961184818138, "grad_norm": 0.39272868633270264, "learning_rate": 2.2813759639123577e-07, "loss": 0.6526, "step": 4492 }, { "epoch": 0.9063978540380209, "grad_norm": 0.5846182703971863, "learning_rate": 2.2716281096166137e-07, "loss": 0.8201, "step": 4493 }, { "epoch": 0.9065995895942278, "grad_norm": 0.3155399262905121, "learning_rate": 2.2619006411371437e-07, "loss": 0.6852, "step": 4494 }, { "epoch": 0.9068013251504349, "grad_norm": 0.38381192088127136, "learning_rate": 2.2521935626287516e-07, "loss": 0.7007, "step": 4495 }, { "epoch": 0.9070030607066418, "grad_norm": 0.6401692628860474, "learning_rate": 2.242506878237538e-07, "loss": 0.6858, "step": 4496 }, { "epoch": 0.9072047962628488, "grad_norm": 0.540847659111023, "learning_rate": 2.2328405921008877e-07, "loss": 0.7831, "step": 4497 }, { "epoch": 0.9074065318190558, "grad_norm": 0.4094178080558777, "learning_rate": 2.2231947083474925e-07, "loss": 0.765, "step": 4498 }, { "epoch": 0.9076082673752628, "grad_norm": 0.38613298535346985, "learning_rate": 2.213569231097312e-07, "loss": 0.7178, "step": 4499 }, { "epoch": 0.9078100029314699, "grad_norm": 1.2126609086990356, "learning_rate": 2.203964164461597e-07, "loss": 0.6322, "step": 4500 }, { "epoch": 0.9080117384876768, "grad_norm": 0.5331121683120728, "learning_rate": 2.1943795125428659e-07, "loss": 0.7294, "step": 4501 }, { "epoch": 0.9082134740438838, "grad_norm": 0.3965674042701721, "learning_rate": 2.1848152794349487e-07, "loss": 0.6751, "step": 4502 }, { "epoch": 0.9084152096000908, "grad_norm": 0.3532261550426483, "learning_rate": 2.1752714692229282e-07, "loss": 0.8164, "step": 4503 }, { "epoch": 0.9086169451562978, "grad_norm": 1.104665756225586, "learning_rate": 2.1657480859831603e-07, "loss": 0.6374, "step": 4504 }, { "epoch": 0.9088186807125048, "grad_norm": 0.8871171474456787, "learning_rate": 2.156245133783308e-07, "loss": 0.7202, "step": 4505 }, { "epoch": 0.9090204162687118, "grad_norm": 0.5117438435554504, "learning_rate": 2.1467626166822742e-07, "loss": 0.8585, "step": 4506 }, { "epoch": 0.9092221518249187, "grad_norm": 0.5775021910667419, "learning_rate": 2.1373005387302416e-07, "loss": 0.7866, "step": 4507 }, { "epoch": 0.9094238873811258, "grad_norm": 0.6503050923347473, "learning_rate": 2.127858903968677e-07, "loss": 0.6747, "step": 4508 }, { "epoch": 0.9096256229373327, "grad_norm": 0.4406130313873291, "learning_rate": 2.1184377164303106e-07, "loss": 0.656, "step": 4509 }, { "epoch": 0.9098273584935397, "grad_norm": 0.41005194187164307, "learning_rate": 2.1090369801391231e-07, "loss": 0.7006, "step": 4510 }, { "epoch": 0.9100290940497467, "grad_norm": 0.38209623098373413, "learning_rate": 2.0996566991103752e-07, "loss": 0.6916, "step": 4511 }, { "epoch": 0.9102308296059537, "grad_norm": 0.49952998757362366, "learning_rate": 2.0902968773505838e-07, "loss": 0.7505, "step": 4512 }, { "epoch": 0.9104325651621608, "grad_norm": 0.6335734724998474, "learning_rate": 2.0809575188575404e-07, "loss": 0.6507, "step": 4513 }, { "epoch": 0.9106343007183677, "grad_norm": 0.46581634879112244, "learning_rate": 2.0716386276202815e-07, "loss": 0.6842, "step": 4514 }, { "epoch": 0.9108360362745747, "grad_norm": 0.437761515378952, "learning_rate": 2.0623402076190956e-07, "loss": 0.6598, "step": 4515 }, { "epoch": 0.9110377718307817, "grad_norm": 0.4070280194282532, "learning_rate": 2.0530622628255613e-07, "loss": 0.658, "step": 4516 }, { "epoch": 0.9112395073869887, "grad_norm": 0.4778706431388855, "learning_rate": 2.04380479720247e-07, "loss": 0.7122, "step": 4517 }, { "epoch": 0.9114412429431957, "grad_norm": 0.5471115708351135, "learning_rate": 2.0345678147038807e-07, "loss": 0.6866, "step": 4518 }, { "epoch": 0.9116429784994027, "grad_norm": 1.0620917081832886, "learning_rate": 2.0253513192751374e-07, "loss": 0.6739, "step": 4519 }, { "epoch": 0.9118447140556096, "grad_norm": 1.0109601020812988, "learning_rate": 2.0161553148527692e-07, "loss": 0.7746, "step": 4520 }, { "epoch": 0.9120464496118167, "grad_norm": 0.51026850938797, "learning_rate": 2.0069798053646005e-07, "loss": 0.6865, "step": 4521 }, { "epoch": 0.9122481851680236, "grad_norm": 0.48596811294555664, "learning_rate": 1.9978247947297025e-07, "loss": 0.7223, "step": 4522 }, { "epoch": 0.9124499207242307, "grad_norm": 1.3675061464309692, "learning_rate": 1.9886902868583525e-07, "loss": 0.6901, "step": 4523 }, { "epoch": 0.9126516562804377, "grad_norm": 0.43187323212623596, "learning_rate": 1.9795762856521183e-07, "loss": 0.7347, "step": 4524 }, { "epoch": 0.9128533918366446, "grad_norm": 0.5734922885894775, "learning_rate": 1.9704827950037753e-07, "loss": 0.6259, "step": 4525 }, { "epoch": 0.9130551273928517, "grad_norm": 0.36623337864875793, "learning_rate": 1.9614098187973495e-07, "loss": 0.6741, "step": 4526 }, { "epoch": 0.9132568629490586, "grad_norm": 1.3819425106048584, "learning_rate": 1.9523573609081137e-07, "loss": 0.7965, "step": 4527 }, { "epoch": 0.9134585985052656, "grad_norm": 0.6897246837615967, "learning_rate": 1.9433254252025524e-07, "loss": 0.7002, "step": 4528 }, { "epoch": 0.9136603340614726, "grad_norm": 0.500214159488678, "learning_rate": 1.9343140155384023e-07, "loss": 0.9337, "step": 4529 }, { "epoch": 0.9138620696176796, "grad_norm": 0.33126023411750793, "learning_rate": 1.9253231357646507e-07, "loss": 0.6866, "step": 4530 }, { "epoch": 0.9140638051738866, "grad_norm": 0.3791239261627197, "learning_rate": 1.9163527897214706e-07, "loss": 0.6401, "step": 4531 }, { "epoch": 0.9142655407300936, "grad_norm": 0.8828220963478088, "learning_rate": 1.9074029812403084e-07, "loss": 0.64, "step": 4532 }, { "epoch": 0.9144672762863005, "grad_norm": 0.3454236388206482, "learning_rate": 1.8984737141438113e-07, "loss": 0.6586, "step": 4533 }, { "epoch": 0.9146690118425076, "grad_norm": 0.4951605796813965, "learning_rate": 1.889564992245857e-07, "loss": 0.6759, "step": 4534 }, { "epoch": 0.9148707473987145, "grad_norm": 1.244110345840454, "learning_rate": 1.880676819351568e-07, "loss": 0.6697, "step": 4535 }, { "epoch": 0.9150724829549216, "grad_norm": 0.743262529373169, "learning_rate": 1.871809199257263e-07, "loss": 0.8218, "step": 4536 }, { "epoch": 0.9152742185111286, "grad_norm": 0.8847272396087646, "learning_rate": 1.8629621357504902e-07, "loss": 0.6451, "step": 4537 }, { "epoch": 0.9154759540673355, "grad_norm": 0.793997585773468, "learning_rate": 1.8541356326100436e-07, "loss": 0.7275, "step": 4538 }, { "epoch": 0.9156776896235426, "grad_norm": 0.8958233594894409, "learning_rate": 1.8453296936058796e-07, "loss": 0.6335, "step": 4539 }, { "epoch": 0.9158794251797495, "grad_norm": 0.5523783564567566, "learning_rate": 1.8365443224992286e-07, "loss": 0.6393, "step": 4540 }, { "epoch": 0.9160811607359566, "grad_norm": 0.3359818458557129, "learning_rate": 1.8277795230425054e-07, "loss": 0.7353, "step": 4541 }, { "epoch": 0.9162828962921635, "grad_norm": 0.5899154543876648, "learning_rate": 1.8190352989793325e-07, "loss": 0.7054, "step": 4542 }, { "epoch": 0.9164846318483705, "grad_norm": 0.4736073911190033, "learning_rate": 1.810311654044583e-07, "loss": 0.7244, "step": 4543 }, { "epoch": 0.9166863674045775, "grad_norm": 0.6302992701530457, "learning_rate": 1.8016085919642934e-07, "loss": 0.6488, "step": 4544 }, { "epoch": 0.9168881029607845, "grad_norm": 0.7711555361747742, "learning_rate": 1.7929261164557287e-07, "loss": 0.7604, "step": 4545 }, { "epoch": 0.9170898385169914, "grad_norm": 1.415159821510315, "learning_rate": 1.7842642312273728e-07, "loss": 0.7046, "step": 4546 }, { "epoch": 0.9172915740731985, "grad_norm": 0.38230201601982117, "learning_rate": 1.7756229399788993e-07, "loss": 0.7197, "step": 4547 }, { "epoch": 0.9174933096294055, "grad_norm": 0.4994561970233917, "learning_rate": 1.7670022464011837e-07, "loss": 0.8675, "step": 4548 }, { "epoch": 0.9176950451856125, "grad_norm": 0.6563410758972168, "learning_rate": 1.758402154176314e-07, "loss": 1.1117, "step": 4549 }, { "epoch": 0.9178967807418195, "grad_norm": 0.4620409905910492, "learning_rate": 1.7498226669775854e-07, "loss": 0.643, "step": 4550 }, { "epoch": 0.9180985162980264, "grad_norm": 0.37006834149360657, "learning_rate": 1.741263788469466e-07, "loss": 0.684, "step": 4551 }, { "epoch": 0.9183002518542335, "grad_norm": 0.32972970604896545, "learning_rate": 1.7327255223076434e-07, "loss": 0.7611, "step": 4552 }, { "epoch": 0.9185019874104404, "grad_norm": 0.5621365308761597, "learning_rate": 1.7242078721389888e-07, "loss": 0.7297, "step": 4553 }, { "epoch": 0.9187037229666475, "grad_norm": 0.43346965312957764, "learning_rate": 1.7157108416015867e-07, "loss": 0.6915, "step": 4554 }, { "epoch": 0.9189054585228544, "grad_norm": 2.2155392169952393, "learning_rate": 1.7072344343246948e-07, "loss": 0.7675, "step": 4555 }, { "epoch": 0.9191071940790614, "grad_norm": 0.7277094721794128, "learning_rate": 1.6987786539287677e-07, "loss": 0.642, "step": 4556 }, { "epoch": 0.9193089296352684, "grad_norm": 0.3713609576225281, "learning_rate": 1.6903435040254545e-07, "loss": 0.7663, "step": 4557 }, { "epoch": 0.9195106651914754, "grad_norm": 0.746296226978302, "learning_rate": 1.681928988217596e-07, "loss": 0.6506, "step": 4558 }, { "epoch": 0.9197124007476825, "grad_norm": 0.3724304735660553, "learning_rate": 1.6735351100992003e-07, "loss": 0.6939, "step": 4559 }, { "epoch": 0.9199141363038894, "grad_norm": 0.5999610424041748, "learning_rate": 1.6651618732554774e-07, "loss": 0.6673, "step": 4560 }, { "epoch": 0.9201158718600964, "grad_norm": 0.8278008103370667, "learning_rate": 1.6568092812628223e-07, "loss": 0.7195, "step": 4561 }, { "epoch": 0.9203176074163034, "grad_norm": 0.7241888046264648, "learning_rate": 1.648477337688803e-07, "loss": 0.6596, "step": 4562 }, { "epoch": 0.9205193429725104, "grad_norm": 0.49117833375930786, "learning_rate": 1.6401660460921675e-07, "loss": 0.6889, "step": 4563 }, { "epoch": 0.9207210785287173, "grad_norm": 0.990657389163971, "learning_rate": 1.631875410022865e-07, "loss": 0.7313, "step": 4564 }, { "epoch": 0.9209228140849244, "grad_norm": 0.3560831546783447, "learning_rate": 1.6236054330219853e-07, "loss": 0.7822, "step": 4565 }, { "epoch": 0.9211245496411313, "grad_norm": 0.41908150911331177, "learning_rate": 1.6153561186218247e-07, "loss": 0.7748, "step": 4566 }, { "epoch": 0.9213262851973384, "grad_norm": 0.5297964811325073, "learning_rate": 1.6071274703458428e-07, "loss": 0.6574, "step": 4567 }, { "epoch": 0.9215280207535453, "grad_norm": 0.6379546523094177, "learning_rate": 1.5989194917086615e-07, "loss": 0.6792, "step": 4568 }, { "epoch": 0.9217297563097523, "grad_norm": 0.8528479337692261, "learning_rate": 1.5907321862160985e-07, "loss": 0.6961, "step": 4569 }, { "epoch": 0.9219314918659594, "grad_norm": 0.396762490272522, "learning_rate": 1.582565557365129e-07, "loss": 0.813, "step": 4570 }, { "epoch": 0.9221332274221663, "grad_norm": 0.3264789879322052, "learning_rate": 1.5744196086438789e-07, "loss": 0.7792, "step": 4571 }, { "epoch": 0.9223349629783734, "grad_norm": 0.42081427574157715, "learning_rate": 1.566294343531677e-07, "loss": 0.6821, "step": 4572 }, { "epoch": 0.9225366985345803, "grad_norm": 0.5739784240722656, "learning_rate": 1.5581897654989963e-07, "loss": 0.6688, "step": 4573 }, { "epoch": 0.9227384340907873, "grad_norm": 0.38827821612358093, "learning_rate": 1.5501058780074685e-07, "loss": 0.6339, "step": 4574 }, { "epoch": 0.9229401696469943, "grad_norm": 0.8106147646903992, "learning_rate": 1.5420426845099035e-07, "loss": 0.6988, "step": 4575 }, { "epoch": 0.9231419052032013, "grad_norm": 0.46809065341949463, "learning_rate": 1.5340001884502577e-07, "loss": 0.7949, "step": 4576 }, { "epoch": 0.9233436407594083, "grad_norm": 0.3986304998397827, "learning_rate": 1.5259783932636608e-07, "loss": 0.8216, "step": 4577 }, { "epoch": 0.9235453763156153, "grad_norm": 0.4268011748790741, "learning_rate": 1.5179773023763998e-07, "loss": 0.6838, "step": 4578 }, { "epoch": 0.9237471118718222, "grad_norm": 0.6723108887672424, "learning_rate": 1.5099969192058972e-07, "loss": 0.6683, "step": 4579 }, { "epoch": 0.9239488474280293, "grad_norm": 4.252002716064453, "learning_rate": 1.5020372471607593e-07, "loss": 0.742, "step": 4580 }, { "epoch": 0.9241505829842362, "grad_norm": 1.975690245628357, "learning_rate": 1.4940982896407275e-07, "loss": 0.8189, "step": 4581 }, { "epoch": 0.9243523185404432, "grad_norm": 0.6956148743629456, "learning_rate": 1.4861800500367007e-07, "loss": 0.7083, "step": 4582 }, { "epoch": 0.9245540540966503, "grad_norm": 0.56894850730896, "learning_rate": 1.4782825317307348e-07, "loss": 0.6524, "step": 4583 }, { "epoch": 0.9247557896528572, "grad_norm": 0.3585595190525055, "learning_rate": 1.4704057380960313e-07, "loss": 0.665, "step": 4584 }, { "epoch": 0.9249575252090643, "grad_norm": 0.48725804686546326, "learning_rate": 1.4625496724969324e-07, "loss": 0.6495, "step": 4585 }, { "epoch": 0.9251592607652712, "grad_norm": 0.9633391499519348, "learning_rate": 1.454714338288943e-07, "loss": 0.7812, "step": 4586 }, { "epoch": 0.9253609963214782, "grad_norm": 0.3100288212299347, "learning_rate": 1.4468997388186857e-07, "loss": 0.6137, "step": 4587 }, { "epoch": 0.9255627318776852, "grad_norm": 0.49807772040367126, "learning_rate": 1.439105877423963e-07, "loss": 0.7728, "step": 4588 }, { "epoch": 0.9257644674338922, "grad_norm": 0.6176745295524597, "learning_rate": 1.4313327574336899e-07, "loss": 0.658, "step": 4589 }, { "epoch": 0.9259662029900992, "grad_norm": 0.7888067960739136, "learning_rate": 1.4235803821679328e-07, "loss": 0.6395, "step": 4590 }, { "epoch": 0.9261679385463062, "grad_norm": 0.3679797649383545, "learning_rate": 1.415848754937904e-07, "loss": 0.8599, "step": 4591 }, { "epoch": 0.9263696741025131, "grad_norm": 0.7646364569664001, "learning_rate": 1.408137879045951e-07, "loss": 0.708, "step": 4592 }, { "epoch": 0.9265714096587202, "grad_norm": 0.5600611567497253, "learning_rate": 1.4004477577855392e-07, "loss": 0.6493, "step": 4593 }, { "epoch": 0.9267731452149272, "grad_norm": 0.762845516204834, "learning_rate": 1.3927783944413075e-07, "loss": 0.6445, "step": 4594 }, { "epoch": 0.9269748807711342, "grad_norm": 0.32098323106765747, "learning_rate": 1.385129792288986e-07, "loss": 0.6962, "step": 4595 }, { "epoch": 0.9271766163273412, "grad_norm": 0.379790723323822, "learning_rate": 1.377501954595467e-07, "loss": 0.727, "step": 4596 }, { "epoch": 0.9273783518835481, "grad_norm": 0.7628555297851562, "learning_rate": 1.369894884618772e-07, "loss": 0.7543, "step": 4597 }, { "epoch": 0.9275800874397552, "grad_norm": 0.4495057761669159, "learning_rate": 1.3623085856080298e-07, "loss": 0.6668, "step": 4598 }, { "epoch": 0.9277818229959621, "grad_norm": 0.4083491563796997, "learning_rate": 1.3547430608035207e-07, "loss": 0.6652, "step": 4599 }, { "epoch": 0.9279835585521692, "grad_norm": 0.49711883068084717, "learning_rate": 1.3471983134366374e-07, "loss": 0.6794, "step": 4600 }, { "epoch": 0.9281852941083761, "grad_norm": 0.6337301135063171, "learning_rate": 1.3396743467299077e-07, "loss": 0.6634, "step": 4601 }, { "epoch": 0.9283870296645831, "grad_norm": 0.6049329042434692, "learning_rate": 1.3321711638969836e-07, "loss": 0.6498, "step": 4602 }, { "epoch": 0.9285887652207901, "grad_norm": 0.3812504708766937, "learning_rate": 1.3246887681426346e-07, "loss": 0.6642, "step": 4603 }, { "epoch": 0.9287905007769971, "grad_norm": 0.41788598895072937, "learning_rate": 1.3172271626627486e-07, "loss": 0.634, "step": 4604 }, { "epoch": 0.928992236333204, "grad_norm": 0.5177899599075317, "learning_rate": 1.3097863506443432e-07, "loss": 0.6881, "step": 4605 }, { "epoch": 0.9291939718894111, "grad_norm": 0.4392205774784088, "learning_rate": 1.3023663352655424e-07, "loss": 0.6254, "step": 4606 }, { "epoch": 0.9293957074456181, "grad_norm": 0.3761579692363739, "learning_rate": 1.294967119695606e-07, "loss": 0.7679, "step": 4607 }, { "epoch": 0.9295974430018251, "grad_norm": 0.47355636954307556, "learning_rate": 1.287588707094889e-07, "loss": 0.6452, "step": 4608 }, { "epoch": 0.9297991785580321, "grad_norm": 0.40907222032546997, "learning_rate": 1.2802311006148703e-07, "loss": 0.8158, "step": 4609 }, { "epoch": 0.930000914114239, "grad_norm": 0.4282612204551697, "learning_rate": 1.272894303398148e-07, "loss": 0.7309, "step": 4610 }, { "epoch": 0.9302026496704461, "grad_norm": 0.41782447695732117, "learning_rate": 1.2655783185784253e-07, "loss": 0.6939, "step": 4611 }, { "epoch": 0.930404385226653, "grad_norm": 0.4852448105812073, "learning_rate": 1.2582831492805092e-07, "loss": 0.6524, "step": 4612 }, { "epoch": 0.9306061207828601, "grad_norm": 0.3634573817253113, "learning_rate": 1.2510087986203346e-07, "loss": 0.6512, "step": 4613 }, { "epoch": 0.930807856339067, "grad_norm": 0.5622749924659729, "learning_rate": 1.2437552697049327e-07, "loss": 0.8715, "step": 4614 }, { "epoch": 0.931009591895274, "grad_norm": 0.4542370140552521, "learning_rate": 1.2365225656324308e-07, "loss": 0.6442, "step": 4615 }, { "epoch": 0.931211327451481, "grad_norm": 0.36814215779304504, "learning_rate": 1.2293106894920803e-07, "loss": 0.6481, "step": 4616 }, { "epoch": 0.931413063007688, "grad_norm": 0.3866225481033325, "learning_rate": 1.2221196443642336e-07, "loss": 0.6577, "step": 4617 }, { "epoch": 0.9316147985638951, "grad_norm": 0.42240482568740845, "learning_rate": 1.214949433320334e-07, "loss": 0.8365, "step": 4618 }, { "epoch": 0.931816534120102, "grad_norm": 0.7792031168937683, "learning_rate": 1.2078000594229312e-07, "loss": 0.6615, "step": 4619 }, { "epoch": 0.932018269676309, "grad_norm": 0.36705565452575684, "learning_rate": 1.200671525725683e-07, "loss": 0.6638, "step": 4620 }, { "epoch": 0.932220005232516, "grad_norm": 0.4371699094772339, "learning_rate": 1.1935638352733424e-07, "loss": 0.7768, "step": 4621 }, { "epoch": 0.932421740788723, "grad_norm": 0.3500683605670929, "learning_rate": 1.1864769911017482e-07, "loss": 0.703, "step": 4622 }, { "epoch": 0.9326234763449299, "grad_norm": 0.6728479266166687, "learning_rate": 1.1794109962378452e-07, "loss": 0.6745, "step": 4623 }, { "epoch": 0.932825211901137, "grad_norm": 0.349743515253067, "learning_rate": 1.1723658536996807e-07, "loss": 0.6736, "step": 4624 }, { "epoch": 0.9330269474573439, "grad_norm": 0.9413335919380188, "learning_rate": 1.1653415664963807e-07, "loss": 0.7034, "step": 4625 }, { "epoch": 0.933228683013551, "grad_norm": 1.0594662427902222, "learning_rate": 1.1583381376281733e-07, "loss": 0.7869, "step": 4626 }, { "epoch": 0.933430418569758, "grad_norm": 0.5527055859565735, "learning_rate": 1.1513555700863655e-07, "loss": 0.6817, "step": 4627 }, { "epoch": 0.9336321541259649, "grad_norm": 1.049141526222229, "learning_rate": 1.1443938668533716e-07, "loss": 0.8054, "step": 4628 }, { "epoch": 0.933833889682172, "grad_norm": 0.5355727672576904, "learning_rate": 1.1374530309026799e-07, "loss": 0.6708, "step": 4629 }, { "epoch": 0.9340356252383789, "grad_norm": 0.48663362860679626, "learning_rate": 1.1305330651988689e-07, "loss": 0.6783, "step": 4630 }, { "epoch": 0.934237360794586, "grad_norm": 0.4144569933414459, "learning_rate": 1.1236339726976132e-07, "loss": 0.749, "step": 4631 }, { "epoch": 0.9344390963507929, "grad_norm": 0.41263678669929504, "learning_rate": 1.1167557563456611e-07, "loss": 0.657, "step": 4632 }, { "epoch": 0.9346408319069999, "grad_norm": 1.1964731216430664, "learning_rate": 1.1098984190808403e-07, "loss": 0.7634, "step": 4633 }, { "epoch": 0.9348425674632069, "grad_norm": 0.40326234698295593, "learning_rate": 1.1030619638320805e-07, "loss": 0.7715, "step": 4634 }, { "epoch": 0.9350443030194139, "grad_norm": 0.3794019818305969, "learning_rate": 1.0962463935193624e-07, "loss": 0.6335, "step": 4635 }, { "epoch": 0.9352460385756209, "grad_norm": 0.4727846682071686, "learning_rate": 1.089451711053774e-07, "loss": 0.8265, "step": 4636 }, { "epoch": 0.9354477741318279, "grad_norm": 0.5812278389930725, "learning_rate": 1.0826779193374715e-07, "loss": 0.7067, "step": 4637 }, { "epoch": 0.9356495096880348, "grad_norm": 0.360893577337265, "learning_rate": 1.0759250212636795e-07, "loss": 0.6921, "step": 4638 }, { "epoch": 0.9358512452442419, "grad_norm": 0.5948578715324402, "learning_rate": 1.0691930197167133e-07, "loss": 0.746, "step": 4639 }, { "epoch": 0.9360529808004489, "grad_norm": 0.8942117094993591, "learning_rate": 1.0624819175719558e-07, "loss": 0.6549, "step": 4640 }, { "epoch": 0.9362547163566558, "grad_norm": 0.9222234487533569, "learning_rate": 1.0557917176958532e-07, "loss": 0.7215, "step": 4641 }, { "epoch": 0.9364564519128629, "grad_norm": 0.9265431761741638, "learning_rate": 1.0491224229459529e-07, "loss": 0.8325, "step": 4642 }, { "epoch": 0.9366581874690698, "grad_norm": 0.3459104299545288, "learning_rate": 1.0424740361708374e-07, "loss": 0.6419, "step": 4643 }, { "epoch": 0.9368599230252769, "grad_norm": 0.521603524684906, "learning_rate": 1.0358465602101796e-07, "loss": 0.6269, "step": 4644 }, { "epoch": 0.9370616585814838, "grad_norm": 0.6259787678718567, "learning_rate": 1.0292399978947265e-07, "loss": 0.715, "step": 4645 }, { "epoch": 0.9372633941376908, "grad_norm": 0.6800326108932495, "learning_rate": 1.0226543520462707e-07, "loss": 0.6915, "step": 4646 }, { "epoch": 0.9374651296938978, "grad_norm": 0.47363513708114624, "learning_rate": 1.0160896254776897e-07, "loss": 0.6676, "step": 4647 }, { "epoch": 0.9376668652501048, "grad_norm": 0.6539434790611267, "learning_rate": 1.0095458209929243e-07, "loss": 0.657, "step": 4648 }, { "epoch": 0.9378686008063118, "grad_norm": 0.4961993992328644, "learning_rate": 1.0030229413869607e-07, "loss": 0.671, "step": 4649 }, { "epoch": 0.9380703363625188, "grad_norm": 0.4845615327358246, "learning_rate": 9.965209894458761e-08, "loss": 0.624, "step": 4650 }, { "epoch": 0.9382720719187257, "grad_norm": 0.37729620933532715, "learning_rate": 9.900399679467876e-08, "loss": 0.6853, "step": 4651 }, { "epoch": 0.9384738074749328, "grad_norm": 0.41500183939933777, "learning_rate": 9.835798796578755e-08, "loss": 0.6308, "step": 4652 }, { "epoch": 0.9386755430311398, "grad_norm": 1.0571080446243286, "learning_rate": 9.771407273383938e-08, "loss": 0.6582, "step": 4653 }, { "epoch": 0.9388772785873468, "grad_norm": 0.31755977869033813, "learning_rate": 9.707225137386256e-08, "loss": 0.6526, "step": 4654 }, { "epoch": 0.9390790141435538, "grad_norm": 0.3515841066837311, "learning_rate": 9.643252415999504e-08, "loss": 0.7137, "step": 4655 }, { "epoch": 0.9392807496997607, "grad_norm": 1.5465117692947388, "learning_rate": 9.579489136547659e-08, "loss": 0.6578, "step": 4656 }, { "epoch": 0.9394824852559678, "grad_norm": 1.811485767364502, "learning_rate": 9.51593532626538e-08, "loss": 0.6434, "step": 4657 }, { "epoch": 0.9396842208121747, "grad_norm": 0.4483201205730438, "learning_rate": 9.452591012297951e-08, "loss": 0.6697, "step": 4658 }, { "epoch": 0.9398859563683817, "grad_norm": 0.3444378077983856, "learning_rate": 9.389456221701121e-08, "loss": 0.7016, "step": 4659 }, { "epoch": 0.9400876919245887, "grad_norm": 0.3389723300933838, "learning_rate": 9.326530981440985e-08, "loss": 0.7512, "step": 4660 }, { "epoch": 0.9402894274807957, "grad_norm": 0.45492416620254517, "learning_rate": 9.263815318394376e-08, "loss": 0.6854, "step": 4661 }, { "epoch": 0.9404911630370028, "grad_norm": 0.609420120716095, "learning_rate": 9.201309259348479e-08, "loss": 0.7446, "step": 4662 }, { "epoch": 0.9406928985932097, "grad_norm": 0.41878068447113037, "learning_rate": 9.139012831000937e-08, "loss": 0.6859, "step": 4663 }, { "epoch": 0.9408946341494167, "grad_norm": 1.0418484210968018, "learning_rate": 9.076926059959967e-08, "loss": 0.6834, "step": 4664 }, { "epoch": 0.9410963697056237, "grad_norm": 0.4487755298614502, "learning_rate": 9.015048972744079e-08, "loss": 0.6682, "step": 4665 }, { "epoch": 0.9412981052618307, "grad_norm": 0.46032407879829407, "learning_rate": 8.953381595782462e-08, "loss": 0.6496, "step": 4666 }, { "epoch": 0.9414998408180377, "grad_norm": 0.3330506384372711, "learning_rate": 8.891923955414438e-08, "loss": 0.7225, "step": 4667 }, { "epoch": 0.9417015763742447, "grad_norm": 0.36100640892982483, "learning_rate": 8.8306760778899e-08, "loss": 0.6906, "step": 4668 }, { "epoch": 0.9419033119304516, "grad_norm": 0.5503403544425964, "learning_rate": 8.769637989369195e-08, "loss": 0.721, "step": 4669 }, { "epoch": 0.9421050474866587, "grad_norm": 0.49627622961997986, "learning_rate": 8.708809715922973e-08, "loss": 0.7463, "step": 4670 }, { "epoch": 0.9423067830428656, "grad_norm": 0.4225630760192871, "learning_rate": 8.648191283532337e-08, "loss": 0.6516, "step": 4671 }, { "epoch": 0.9425085185990727, "grad_norm": 1.131819248199463, "learning_rate": 8.587782718088688e-08, "loss": 0.6205, "step": 4672 }, { "epoch": 0.9427102541552796, "grad_norm": 0.6810362339019775, "learning_rate": 8.527584045393833e-08, "loss": 0.6979, "step": 4673 }, { "epoch": 0.9429119897114866, "grad_norm": 0.7575717568397522, "learning_rate": 8.46759529115998e-08, "loss": 0.733, "step": 4674 }, { "epoch": 0.9431137252676937, "grad_norm": 0.31936702132225037, "learning_rate": 8.407816481009524e-08, "loss": 0.7006, "step": 4675 }, { "epoch": 0.9433154608239006, "grad_norm": 0.4622681140899658, "learning_rate": 8.34824764047526e-08, "loss": 0.7778, "step": 4676 }, { "epoch": 0.9435171963801076, "grad_norm": 0.4197992980480194, "learning_rate": 8.288888795000504e-08, "loss": 0.7495, "step": 4677 }, { "epoch": 0.9437189319363146, "grad_norm": 0.5172026753425598, "learning_rate": 8.229739969938533e-08, "loss": 0.6222, "step": 4678 }, { "epoch": 0.9439206674925216, "grad_norm": 0.444822758436203, "learning_rate": 8.17080119055308e-08, "loss": 0.674, "step": 4679 }, { "epoch": 0.9441224030487286, "grad_norm": 0.35563191771507263, "learning_rate": 8.11207248201834e-08, "loss": 0.719, "step": 4680 }, { "epoch": 0.9443241386049356, "grad_norm": 0.557724118232727, "learning_rate": 8.053553869418418e-08, "loss": 0.8011, "step": 4681 }, { "epoch": 0.9445258741611425, "grad_norm": 0.516588568687439, "learning_rate": 7.995245377747984e-08, "loss": 0.6691, "step": 4682 }, { "epoch": 0.9447276097173496, "grad_norm": 1.4332866668701172, "learning_rate": 7.937147031911785e-08, "loss": 0.6962, "step": 4683 }, { "epoch": 0.9449293452735565, "grad_norm": 0.4007475972175598, "learning_rate": 7.879258856724913e-08, "loss": 0.6687, "step": 4684 }, { "epoch": 0.9451310808297636, "grad_norm": 0.6435663104057312, "learning_rate": 7.821580876912705e-08, "loss": 0.8308, "step": 4685 }, { "epoch": 0.9453328163859706, "grad_norm": 0.9628399610519409, "learning_rate": 7.764113117110506e-08, "loss": 0.6806, "step": 4686 }, { "epoch": 0.9455345519421775, "grad_norm": 0.33533141016960144, "learning_rate": 7.706855601864238e-08, "loss": 0.6426, "step": 4687 }, { "epoch": 0.9457362874983846, "grad_norm": 0.5895031094551086, "learning_rate": 7.649808355629729e-08, "loss": 0.6142, "step": 4688 }, { "epoch": 0.9459380230545915, "grad_norm": 0.7705016136169434, "learning_rate": 7.592971402773042e-08, "loss": 0.6929, "step": 4689 }, { "epoch": 0.9461397586107986, "grad_norm": 0.40751463174819946, "learning_rate": 7.536344767570536e-08, "loss": 0.6995, "step": 4690 }, { "epoch": 0.9463414941670055, "grad_norm": 0.7165273427963257, "learning_rate": 7.479928474208586e-08, "loss": 0.6518, "step": 4691 }, { "epoch": 0.9465432297232125, "grad_norm": 0.5997741222381592, "learning_rate": 7.423722546783918e-08, "loss": 0.6692, "step": 4692 }, { "epoch": 0.9467449652794195, "grad_norm": 0.7366072535514832, "learning_rate": 7.367727009303216e-08, "loss": 0.6946, "step": 4693 }, { "epoch": 0.9469467008356265, "grad_norm": 0.38420969247817993, "learning_rate": 7.311941885683405e-08, "loss": 0.705, "step": 4694 }, { "epoch": 0.9471484363918334, "grad_norm": 0.37986132502555847, "learning_rate": 7.25636719975148e-08, "loss": 0.6504, "step": 4695 }, { "epoch": 0.9473501719480405, "grad_norm": 0.5251741409301758, "learning_rate": 7.201002975244676e-08, "loss": 0.6562, "step": 4696 }, { "epoch": 0.9475519075042474, "grad_norm": 0.5595903396606445, "learning_rate": 7.145849235810131e-08, "loss": 0.7736, "step": 4697 }, { "epoch": 0.9477536430604545, "grad_norm": 0.36443448066711426, "learning_rate": 7.090906005005283e-08, "loss": 0.8156, "step": 4698 }, { "epoch": 0.9479553786166615, "grad_norm": 0.3789260983467102, "learning_rate": 7.036173306297522e-08, "loss": 0.7545, "step": 4699 }, { "epoch": 0.9481571141728684, "grad_norm": 1.2341930866241455, "learning_rate": 6.981651163064374e-08, "loss": 0.7395, "step": 4700 }, { "epoch": 0.9483588497290755, "grad_norm": 0.9317732453346252, "learning_rate": 6.927339598593485e-08, "loss": 0.81, "step": 4701 }, { "epoch": 0.9485605852852824, "grad_norm": 0.46789565682411194, "learning_rate": 6.873238636082358e-08, "loss": 0.6293, "step": 4702 }, { "epoch": 0.9487623208414895, "grad_norm": 0.8435044884681702, "learning_rate": 6.819348298638839e-08, "loss": 0.7131, "step": 4703 }, { "epoch": 0.9489640563976964, "grad_norm": 0.5798925161361694, "learning_rate": 6.765668609280519e-08, "loss": 0.7062, "step": 4704 }, { "epoch": 0.9491657919539034, "grad_norm": 0.49503710865974426, "learning_rate": 6.71219959093522e-08, "loss": 0.8745, "step": 4705 }, { "epoch": 0.9493675275101104, "grad_norm": 0.36657482385635376, "learning_rate": 6.658941266440677e-08, "loss": 0.6438, "step": 4706 }, { "epoch": 0.9495692630663174, "grad_norm": 0.9516401290893555, "learning_rate": 6.605893658544693e-08, "loss": 0.6972, "step": 4707 }, { "epoch": 0.9497709986225245, "grad_norm": 0.5418645739555359, "learning_rate": 6.553056789905032e-08, "loss": 0.634, "step": 4708 }, { "epoch": 0.9499727341787314, "grad_norm": 0.43398237228393555, "learning_rate": 6.500430683089532e-08, "loss": 0.7135, "step": 4709 }, { "epoch": 0.9501744697349384, "grad_norm": 0.4190267324447632, "learning_rate": 6.448015360575821e-08, "loss": 0.6784, "step": 4710 }, { "epoch": 0.9503762052911454, "grad_norm": 0.38937604427337646, "learning_rate": 6.395810844751604e-08, "loss": 0.8755, "step": 4711 }, { "epoch": 0.9505779408473524, "grad_norm": 0.4863095283508301, "learning_rate": 6.343817157914712e-08, "loss": 0.8249, "step": 4712 }, { "epoch": 0.9507796764035594, "grad_norm": 0.5813011527061462, "learning_rate": 6.292034322272656e-08, "loss": 0.6882, "step": 4713 }, { "epoch": 0.9509814119597664, "grad_norm": 0.3933367431163788, "learning_rate": 6.240462359942967e-08, "loss": 0.7558, "step": 4714 }, { "epoch": 0.9511831475159733, "grad_norm": 0.4394649267196655, "learning_rate": 6.189101292953247e-08, "loss": 0.6481, "step": 4715 }, { "epoch": 0.9513848830721804, "grad_norm": 0.39462268352508545, "learning_rate": 6.137951143240783e-08, "loss": 0.6553, "step": 4716 }, { "epoch": 0.9515866186283873, "grad_norm": 0.7447942495346069, "learning_rate": 6.087011932653097e-08, "loss": 0.6813, "step": 4717 }, { "epoch": 0.9517883541845943, "grad_norm": 0.48867762088775635, "learning_rate": 6.036283682947231e-08, "loss": 0.6461, "step": 4718 }, { "epoch": 0.9519900897408013, "grad_norm": 0.320430189371109, "learning_rate": 5.98576641579035e-08, "loss": 0.6561, "step": 4719 }, { "epoch": 0.9521918252970083, "grad_norm": 0.659767746925354, "learning_rate": 5.935460152759642e-08, "loss": 0.676, "step": 4720 }, { "epoch": 0.9523935608532154, "grad_norm": 0.7774620652198792, "learning_rate": 5.8853649153417515e-08, "loss": 0.6979, "step": 4721 }, { "epoch": 0.9525952964094223, "grad_norm": 0.3772054612636566, "learning_rate": 5.835480724933562e-08, "loss": 0.7595, "step": 4722 }, { "epoch": 0.9527970319656293, "grad_norm": 0.33030465245246887, "learning_rate": 5.7858076028416975e-08, "loss": 0.6754, "step": 4723 }, { "epoch": 0.9529987675218363, "grad_norm": 0.47675344347953796, "learning_rate": 5.736345570282575e-08, "loss": 0.8973, "step": 4724 }, { "epoch": 0.9532005030780433, "grad_norm": 0.5579949021339417, "learning_rate": 5.687094648382518e-08, "loss": 0.703, "step": 4725 }, { "epoch": 0.9534022386342503, "grad_norm": 0.47706276178359985, "learning_rate": 5.638054858177644e-08, "loss": 0.707, "step": 4726 }, { "epoch": 0.9536039741904573, "grad_norm": 0.3631863296031952, "learning_rate": 5.589226220613919e-08, "loss": 0.6557, "step": 4727 }, { "epoch": 0.9538057097466642, "grad_norm": 0.5944626927375793, "learning_rate": 5.5406087565471054e-08, "loss": 0.6658, "step": 4728 }, { "epoch": 0.9540074453028713, "grad_norm": 0.579210102558136, "learning_rate": 5.492202486742759e-08, "loss": 0.7539, "step": 4729 }, { "epoch": 0.9542091808590782, "grad_norm": 0.4768008887767792, "learning_rate": 5.44400743187623e-08, "loss": 0.6639, "step": 4730 }, { "epoch": 0.9544109164152853, "grad_norm": 0.3371616005897522, "learning_rate": 5.396023612532719e-08, "loss": 0.6591, "step": 4731 }, { "epoch": 0.9546126519714923, "grad_norm": 0.34802260994911194, "learning_rate": 5.348251049207054e-08, "loss": 0.9614, "step": 4732 }, { "epoch": 0.9548143875276992, "grad_norm": 0.36523744463920593, "learning_rate": 5.300689762304023e-08, "loss": 0.8026, "step": 4733 }, { "epoch": 0.9550161230839063, "grad_norm": 0.5702638626098633, "learning_rate": 5.2533397721379887e-08, "loss": 0.5845, "step": 4734 }, { "epoch": 0.9552178586401132, "grad_norm": 0.44271722435951233, "learning_rate": 5.206201098933217e-08, "loss": 0.8374, "step": 4735 }, { "epoch": 0.9554195941963202, "grad_norm": 0.5539191365242004, "learning_rate": 5.159273762823658e-08, "loss": 0.6885, "step": 4736 }, { "epoch": 0.9556213297525272, "grad_norm": 0.6696466207504272, "learning_rate": 5.112557783852945e-08, "loss": 0.6233, "step": 4737 }, { "epoch": 0.9558230653087342, "grad_norm": 0.33567628264427185, "learning_rate": 5.0660531819745065e-08, "loss": 0.6452, "step": 4738 }, { "epoch": 0.9560248008649412, "grad_norm": 0.8079859614372253, "learning_rate": 5.0197599770514524e-08, "loss": 0.6868, "step": 4739 }, { "epoch": 0.9562265364211482, "grad_norm": 0.4152475595474243, "learning_rate": 4.9736781888566345e-08, "loss": 0.6861, "step": 4740 }, { "epoch": 0.9564282719773551, "grad_norm": 0.3019008934497833, "learning_rate": 4.927807837072529e-08, "loss": 0.6617, "step": 4741 }, { "epoch": 0.9566300075335622, "grad_norm": 0.43031102418899536, "learning_rate": 4.882148941291298e-08, "loss": 0.8175, "step": 4742 }, { "epoch": 0.9568317430897691, "grad_norm": 0.5170923471450806, "learning_rate": 4.836701521015008e-08, "loss": 0.6904, "step": 4743 }, { "epoch": 0.9570334786459762, "grad_norm": 0.44814711809158325, "learning_rate": 4.791465595655132e-08, "loss": 0.6623, "step": 4744 }, { "epoch": 0.9572352142021832, "grad_norm": 2.267430305480957, "learning_rate": 4.746441184532879e-08, "loss": 0.801, "step": 4745 }, { "epoch": 0.9574369497583901, "grad_norm": 0.6424638628959656, "learning_rate": 4.701628306879202e-08, "loss": 0.7212, "step": 4746 }, { "epoch": 0.9576386853145972, "grad_norm": 0.7151271104812622, "learning_rate": 4.657026981834623e-08, "loss": 0.64, "step": 4747 }, { "epoch": 0.9578404208708041, "grad_norm": 1.2689751386642456, "learning_rate": 4.612637228449346e-08, "loss": 0.6775, "step": 4748 }, { "epoch": 0.9580421564270112, "grad_norm": 0.46473121643066406, "learning_rate": 4.568459065683206e-08, "loss": 0.799, "step": 4749 }, { "epoch": 0.9582438919832181, "grad_norm": 0.44650596380233765, "learning_rate": 4.524492512405554e-08, "loss": 0.6784, "step": 4750 }, { "epoch": 0.9584456275394251, "grad_norm": 0.7469905018806458, "learning_rate": 4.4807375873955336e-08, "loss": 0.6511, "step": 4751 }, { "epoch": 0.9586473630956321, "grad_norm": 0.5191195607185364, "learning_rate": 4.437194309341808e-08, "loss": 0.7911, "step": 4752 }, { "epoch": 0.9588490986518391, "grad_norm": 1.0587661266326904, "learning_rate": 4.393862696842666e-08, "loss": 0.6558, "step": 4753 }, { "epoch": 0.959050834208046, "grad_norm": 0.4224800169467926, "learning_rate": 4.350742768405913e-08, "loss": 0.6558, "step": 4754 }, { "epoch": 0.9592525697642531, "grad_norm": 0.4730607271194458, "learning_rate": 4.307834542449096e-08, "loss": 0.6371, "step": 4755 }, { "epoch": 0.95945430532046, "grad_norm": 0.6827614903450012, "learning_rate": 4.26513803729911e-08, "loss": 0.7278, "step": 4756 }, { "epoch": 0.9596560408766671, "grad_norm": 0.78753262758255, "learning_rate": 4.2226532711927005e-08, "loss": 0.6537, "step": 4757 }, { "epoch": 0.9598577764328741, "grad_norm": 0.39794209599494934, "learning_rate": 4.180380262275907e-08, "loss": 0.6444, "step": 4758 }, { "epoch": 0.960059511989081, "grad_norm": 0.3673033118247986, "learning_rate": 4.138319028604509e-08, "loss": 0.8612, "step": 4759 }, { "epoch": 0.9602612475452881, "grad_norm": 0.659838855266571, "learning_rate": 4.0964695881437475e-08, "loss": 0.6587, "step": 4760 }, { "epoch": 0.960462983101495, "grad_norm": 0.6321941614151001, "learning_rate": 4.054831958768435e-08, "loss": 0.6918, "step": 4761 }, { "epoch": 0.9606647186577021, "grad_norm": 0.5344189405441284, "learning_rate": 4.0134061582628446e-08, "loss": 0.7182, "step": 4762 }, { "epoch": 0.960866454213909, "grad_norm": 0.797895610332489, "learning_rate": 3.9721922043208797e-08, "loss": 0.6649, "step": 4763 }, { "epoch": 0.961068189770116, "grad_norm": 0.526902437210083, "learning_rate": 3.931190114545902e-08, "loss": 0.624, "step": 4764 }, { "epoch": 0.961269925326323, "grad_norm": 0.4661473035812378, "learning_rate": 3.8903999064507923e-08, "loss": 0.6719, "step": 4765 }, { "epoch": 0.96147166088253, "grad_norm": 0.5854642987251282, "learning_rate": 3.849821597457892e-08, "loss": 0.6386, "step": 4766 }, { "epoch": 0.9616733964387371, "grad_norm": 0.49384617805480957, "learning_rate": 3.809455204899115e-08, "loss": 0.6907, "step": 4767 }, { "epoch": 0.961875131994944, "grad_norm": 0.49993792176246643, "learning_rate": 3.769300746015836e-08, "loss": 0.7345, "step": 4768 }, { "epoch": 0.962076867551151, "grad_norm": 0.6220202445983887, "learning_rate": 3.72935823795878e-08, "loss": 0.7145, "step": 4769 }, { "epoch": 0.962278603107358, "grad_norm": 0.8840739727020264, "learning_rate": 3.689627697788356e-08, "loss": 0.7168, "step": 4770 }, { "epoch": 0.962480338663565, "grad_norm": 0.34587568044662476, "learning_rate": 3.650109142474323e-08, "loss": 0.6895, "step": 4771 }, { "epoch": 0.9626820742197719, "grad_norm": 0.3984651267528534, "learning_rate": 3.610802588895845e-08, "loss": 0.6525, "step": 4772 }, { "epoch": 0.962883809775979, "grad_norm": 1.5117933750152588, "learning_rate": 3.571708053841716e-08, "loss": 0.7711, "step": 4773 }, { "epoch": 0.9630855453321859, "grad_norm": 0.4134984314441681, "learning_rate": 3.532825554009966e-08, "loss": 0.6177, "step": 4774 }, { "epoch": 0.963287280888393, "grad_norm": 0.6590429544448853, "learning_rate": 3.49415510600809e-08, "loss": 0.7145, "step": 4775 }, { "epoch": 0.9634890164445999, "grad_norm": 0.44440943002700806, "learning_rate": 3.455696726353208e-08, "loss": 0.6574, "step": 4776 }, { "epoch": 0.9636907520008069, "grad_norm": 0.529772162437439, "learning_rate": 3.417450431471625e-08, "loss": 0.6641, "step": 4777 }, { "epoch": 0.963892487557014, "grad_norm": 0.7167370915412903, "learning_rate": 3.379416237699218e-08, "loss": 0.8083, "step": 4778 }, { "epoch": 0.9640942231132209, "grad_norm": 1.930140495300293, "learning_rate": 3.341594161281214e-08, "loss": 0.657, "step": 4779 }, { "epoch": 0.964295958669428, "grad_norm": 0.4069491922855377, "learning_rate": 3.303984218372136e-08, "loss": 0.7373, "step": 4780 }, { "epoch": 0.9644976942256349, "grad_norm": 0.38006487488746643, "learning_rate": 3.2665864250360777e-08, "loss": 0.6501, "step": 4781 }, { "epoch": 0.9646994297818419, "grad_norm": 0.4782893657684326, "learning_rate": 3.2294007972464845e-08, "loss": 0.6497, "step": 4782 }, { "epoch": 0.9649011653380489, "grad_norm": 0.32621467113494873, "learning_rate": 3.19242735088604e-08, "loss": 0.8127, "step": 4783 }, { "epoch": 0.9651029008942559, "grad_norm": 0.5363353490829468, "learning_rate": 3.155666101747001e-08, "loss": 0.6488, "step": 4784 }, { "epoch": 0.9653046364504629, "grad_norm": 0.4314109981060028, "learning_rate": 3.119117065530808e-08, "loss": 0.8215, "step": 4785 }, { "epoch": 0.9655063720066699, "grad_norm": 0.38545098900794983, "learning_rate": 3.082780257848361e-08, "loss": 0.6359, "step": 4786 }, { "epoch": 0.9657081075628768, "grad_norm": 0.41607731580734253, "learning_rate": 3.046655694219969e-08, "loss": 0.6006, "step": 4787 }, { "epoch": 0.9659098431190839, "grad_norm": 0.7114999890327454, "learning_rate": 3.0107433900751216e-08, "loss": 0.7779, "step": 4788 }, { "epoch": 0.9661115786752908, "grad_norm": 0.5416654944419861, "learning_rate": 2.9750433607527163e-08, "loss": 0.6834, "step": 4789 }, { "epoch": 0.9663133142314978, "grad_norm": 0.6984711289405823, "learning_rate": 2.9395556215011113e-08, "loss": 0.6532, "step": 4790 }, { "epoch": 0.9665150497877049, "grad_norm": 0.5756306648254395, "learning_rate": 2.9042801874777925e-08, "loss": 0.7076, "step": 4791 }, { "epoch": 0.9667167853439118, "grad_norm": 0.7223901152610779, "learning_rate": 2.8692170737497083e-08, "loss": 0.6489, "step": 4792 }, { "epoch": 0.9669185209001189, "grad_norm": 0.4999725818634033, "learning_rate": 2.8343662952931005e-08, "loss": 0.7008, "step": 4793 }, { "epoch": 0.9671202564563258, "grad_norm": 0.700188398361206, "learning_rate": 2.7997278669933405e-08, "loss": 0.6829, "step": 4794 }, { "epoch": 0.9673219920125328, "grad_norm": 0.726335883140564, "learning_rate": 2.765301803645426e-08, "loss": 0.7448, "step": 4795 }, { "epoch": 0.9675237275687398, "grad_norm": 0.7787569761276245, "learning_rate": 2.7310881199533736e-08, "loss": 0.6284, "step": 4796 }, { "epoch": 0.9677254631249468, "grad_norm": 0.5372986197471619, "learning_rate": 2.69708683053066e-08, "loss": 0.6637, "step": 4797 }, { "epoch": 0.9679271986811538, "grad_norm": 1.1246623992919922, "learning_rate": 2.6632979498998347e-08, "loss": 0.6536, "step": 4798 }, { "epoch": 0.9681289342373608, "grad_norm": 0.39886438846588135, "learning_rate": 2.629721492492965e-08, "loss": 0.6609, "step": 4799 }, { "epoch": 0.9683306697935677, "grad_norm": 0.9603978991508484, "learning_rate": 2.5963574726512454e-08, "loss": 0.6716, "step": 4800 }, { "epoch": 0.9685324053497748, "grad_norm": 0.40881672501564026, "learning_rate": 2.5632059046251655e-08, "loss": 0.6358, "step": 4801 }, { "epoch": 0.9687341409059818, "grad_norm": 0.9251068830490112, "learning_rate": 2.53026680257451e-08, "loss": 0.6458, "step": 4802 }, { "epoch": 0.9689358764621888, "grad_norm": 0.9787841439247131, "learning_rate": 2.4975401805682475e-08, "loss": 0.7053, "step": 4803 }, { "epoch": 0.9691376120183958, "grad_norm": 0.4082057774066925, "learning_rate": 2.4650260525846404e-08, "loss": 0.8172, "step": 4804 }, { "epoch": 0.9693393475746027, "grad_norm": 0.5025261044502258, "learning_rate": 2.4327244325111354e-08, "loss": 0.8375, "step": 4805 }, { "epoch": 0.9695410831308098, "grad_norm": 0.45469751954078674, "learning_rate": 2.4006353341444745e-08, "loss": 0.822, "step": 4806 }, { "epoch": 0.9697428186870167, "grad_norm": 0.6082906126976013, "learning_rate": 2.3687587711905825e-08, "loss": 0.6237, "step": 4807 }, { "epoch": 0.9699445542432238, "grad_norm": 0.8955042362213135, "learning_rate": 2.3370947572646796e-08, "loss": 0.8761, "step": 4808 }, { "epoch": 0.9701462897994307, "grad_norm": 0.3564375638961792, "learning_rate": 2.3056433058911142e-08, "loss": 0.6063, "step": 4809 }, { "epoch": 0.9703480253556377, "grad_norm": 0.4515208303928375, "learning_rate": 2.274404430503474e-08, "loss": 0.7077, "step": 4810 }, { "epoch": 0.9705497609118447, "grad_norm": 0.4937743842601776, "learning_rate": 2.2433781444445858e-08, "loss": 0.6696, "step": 4811 }, { "epoch": 0.9707514964680517, "grad_norm": 0.7430995106697083, "learning_rate": 2.2125644609664042e-08, "loss": 0.7526, "step": 4812 }, { "epoch": 0.9709532320242587, "grad_norm": 1.061241865158081, "learning_rate": 2.1819633932301797e-08, "loss": 0.6511, "step": 4813 }, { "epoch": 0.9711549675804657, "grad_norm": 0.3608217239379883, "learning_rate": 2.1515749543061792e-08, "loss": 0.6436, "step": 4814 }, { "epoch": 0.9713567031366727, "grad_norm": 0.3461396396160126, "learning_rate": 2.1213991571740755e-08, "loss": 0.6623, "step": 4815 }, { "epoch": 0.9715584386928797, "grad_norm": 0.46052438020706177, "learning_rate": 2.0914360147225033e-08, "loss": 0.6507, "step": 4816 }, { "epoch": 0.9717601742490867, "grad_norm": 0.3347858786582947, "learning_rate": 2.0616855397494472e-08, "loss": 0.6507, "step": 4817 }, { "epoch": 0.9719619098052936, "grad_norm": 1.116852879524231, "learning_rate": 2.0321477449619098e-08, "loss": 0.6457, "step": 4818 }, { "epoch": 0.9721636453615007, "grad_norm": 0.8443088531494141, "learning_rate": 2.0028226429762433e-08, "loss": 0.7697, "step": 4819 }, { "epoch": 0.9723653809177076, "grad_norm": 0.7438956499099731, "learning_rate": 1.9737102463176504e-08, "loss": 0.8288, "step": 4820 }, { "epoch": 0.9725671164739147, "grad_norm": 0.5241464972496033, "learning_rate": 1.944810567420796e-08, "loss": 0.678, "step": 4821 }, { "epoch": 0.9727688520301216, "grad_norm": 0.4894596040248871, "learning_rate": 1.9161236186293063e-08, "loss": 1.0671, "step": 4822 }, { "epoch": 0.9729705875863286, "grad_norm": 0.7723326683044434, "learning_rate": 1.8876494121959908e-08, "loss": 0.6433, "step": 4823 }, { "epoch": 0.9731723231425357, "grad_norm": 0.46578657627105713, "learning_rate": 1.8593879602828434e-08, "loss": 0.7017, "step": 4824 }, { "epoch": 0.9733740586987426, "grad_norm": 0.671375036239624, "learning_rate": 1.831339274960875e-08, "loss": 0.6607, "step": 4825 }, { "epoch": 0.9735757942549497, "grad_norm": 0.4858841001987457, "learning_rate": 1.8035033682103353e-08, "loss": 0.7833, "step": 4826 }, { "epoch": 0.9737775298111566, "grad_norm": 0.5803720951080322, "learning_rate": 1.7758802519204922e-08, "loss": 0.6924, "step": 4827 }, { "epoch": 0.9739792653673636, "grad_norm": 2.0239927768707275, "learning_rate": 1.7484699378897962e-08, "loss": 0.7044, "step": 4828 }, { "epoch": 0.9741810009235706, "grad_norm": 1.0976885557174683, "learning_rate": 1.721272437825827e-08, "loss": 0.6594, "step": 4829 }, { "epoch": 0.9743827364797776, "grad_norm": 0.34820252656936646, "learning_rate": 1.6942877633451815e-08, "loss": 0.8396, "step": 4830 }, { "epoch": 0.9745844720359845, "grad_norm": 0.6627471446990967, "learning_rate": 1.6675159259735285e-08, "loss": 0.677, "step": 4831 }, { "epoch": 0.9747862075921916, "grad_norm": 0.5685920715332031, "learning_rate": 1.6409569371458323e-08, "loss": 0.8709, "step": 4832 }, { "epoch": 0.9749879431483985, "grad_norm": 0.439563512802124, "learning_rate": 1.6146108082059075e-08, "loss": 0.6527, "step": 4833 }, { "epoch": 0.9751896787046056, "grad_norm": 1.0685017108917236, "learning_rate": 1.5884775504068083e-08, "loss": 0.7461, "step": 4834 }, { "epoch": 0.9753914142608126, "grad_norm": 0.8692358136177063, "learning_rate": 1.562557174910606e-08, "loss": 0.6507, "step": 4835 }, { "epoch": 0.9755931498170195, "grad_norm": 0.4101846218109131, "learning_rate": 1.5368496927884447e-08, "loss": 0.6549, "step": 4836 }, { "epoch": 0.9757948853732266, "grad_norm": 0.41572755575180054, "learning_rate": 1.5113551150204853e-08, "loss": 0.6295, "step": 4837 }, { "epoch": 0.9759966209294335, "grad_norm": 0.35794496536254883, "learning_rate": 1.4860734524961285e-08, "loss": 0.6945, "step": 4838 }, { "epoch": 0.9761983564856406, "grad_norm": 0.6017243266105652, "learning_rate": 1.4610047160136254e-08, "loss": 0.7921, "step": 4839 }, { "epoch": 0.9764000920418475, "grad_norm": 0.34502172470092773, "learning_rate": 1.4361489162804109e-08, "loss": 0.713, "step": 4840 }, { "epoch": 0.9766018275980545, "grad_norm": 0.6996079683303833, "learning_rate": 1.411506063912882e-08, "loss": 0.7164, "step": 4841 }, { "epoch": 0.9768035631542615, "grad_norm": 0.4020618200302124, "learning_rate": 1.387076169436563e-08, "loss": 0.6264, "step": 4842 }, { "epoch": 0.9770052987104685, "grad_norm": 1.5375221967697144, "learning_rate": 1.3628592432861077e-08, "loss": 0.8928, "step": 4843 }, { "epoch": 0.9772070342666755, "grad_norm": 0.7670455574989319, "learning_rate": 1.3388552958048529e-08, "loss": 0.6474, "step": 4844 }, { "epoch": 0.9774087698228825, "grad_norm": 0.4482732117176056, "learning_rate": 1.3150643372455973e-08, "loss": 0.6652, "step": 4845 }, { "epoch": 0.9776105053790894, "grad_norm": 0.4064268469810486, "learning_rate": 1.2914863777698794e-08, "loss": 0.6561, "step": 4846 }, { "epoch": 0.9778122409352965, "grad_norm": 0.745275616645813, "learning_rate": 1.2681214274483655e-08, "loss": 0.6295, "step": 4847 }, { "epoch": 0.9780139764915035, "grad_norm": 1.2450134754180908, "learning_rate": 1.244969496260795e-08, "loss": 0.6317, "step": 4848 }, { "epoch": 0.9782157120477104, "grad_norm": 0.8189371228218079, "learning_rate": 1.2220305940957578e-08, "loss": 0.6434, "step": 4849 }, { "epoch": 0.9784174476039175, "grad_norm": 0.8365606665611267, "learning_rate": 1.199304730750972e-08, "loss": 0.6992, "step": 4850 }, { "epoch": 0.9786191831601244, "grad_norm": 0.40646567940711975, "learning_rate": 1.1767919159332286e-08, "loss": 0.7065, "step": 4851 }, { "epoch": 0.9788209187163315, "grad_norm": 0.6693828701972961, "learning_rate": 1.1544921592581138e-08, "loss": 0.6444, "step": 4852 }, { "epoch": 0.9790226542725384, "grad_norm": 0.7508454918861389, "learning_rate": 1.1324054702504528e-08, "loss": 0.6489, "step": 4853 }, { "epoch": 0.9792243898287454, "grad_norm": 0.4734887480735779, "learning_rate": 1.1105318583438663e-08, "loss": 0.6898, "step": 4854 }, { "epoch": 0.9794261253849524, "grad_norm": 0.7000921368598938, "learning_rate": 1.0888713328810474e-08, "loss": 0.6613, "step": 4855 }, { "epoch": 0.9796278609411594, "grad_norm": 0.569733738899231, "learning_rate": 1.0674239031137069e-08, "loss": 0.6622, "step": 4856 }, { "epoch": 0.9798295964973665, "grad_norm": 0.376286119222641, "learning_rate": 1.0461895782025166e-08, "loss": 0.6579, "step": 4857 }, { "epoch": 0.9800313320535734, "grad_norm": 1.5786285400390625, "learning_rate": 1.0251683672170554e-08, "loss": 0.7327, "step": 4858 }, { "epoch": 0.9802330676097804, "grad_norm": 0.4468984007835388, "learning_rate": 1.0043602791360297e-08, "loss": 0.6494, "step": 4859 }, { "epoch": 0.9804348031659874, "grad_norm": 0.31856024265289307, "learning_rate": 9.837653228469413e-09, "loss": 0.6477, "step": 4860 }, { "epoch": 0.9806365387221944, "grad_norm": 0.6004407405853271, "learning_rate": 9.633835071463094e-09, "loss": 0.7727, "step": 4861 }, { "epoch": 0.9808382742784014, "grad_norm": 0.8156235218048096, "learning_rate": 9.432148407397257e-09, "loss": 0.6428, "step": 4862 }, { "epoch": 0.9810400098346084, "grad_norm": 0.3582519590854645, "learning_rate": 9.232593322416883e-09, "loss": 0.6174, "step": 4863 }, { "epoch": 0.9812417453908153, "grad_norm": 0.4001818299293518, "learning_rate": 9.035169901754902e-09, "loss": 0.6688, "step": 4864 }, { "epoch": 0.9814434809470224, "grad_norm": 0.4952855110168457, "learning_rate": 8.839878229736643e-09, "loss": 0.6671, "step": 4865 }, { "epoch": 0.9816452165032293, "grad_norm": 0.34948718547821045, "learning_rate": 8.646718389774267e-09, "loss": 0.6613, "step": 4866 }, { "epoch": 0.9818469520594363, "grad_norm": 0.6136161684989929, "learning_rate": 8.455690464371224e-09, "loss": 0.7906, "step": 4867 }, { "epoch": 0.9820486876156433, "grad_norm": 0.5521015524864197, "learning_rate": 8.266794535118915e-09, "loss": 0.7189, "step": 4868 }, { "epoch": 0.9822504231718503, "grad_norm": 0.5386425256729126, "learning_rate": 8.08003068269947e-09, "loss": 0.6319, "step": 4869 }, { "epoch": 0.9824521587280574, "grad_norm": 1.521870493888855, "learning_rate": 7.895398986883518e-09, "loss": 0.8588, "step": 4870 }, { "epoch": 0.9826538942842643, "grad_norm": 1.0882370471954346, "learning_rate": 7.71289952653187e-09, "loss": 0.6604, "step": 4871 }, { "epoch": 0.9828556298404713, "grad_norm": 0.4301353991031647, "learning_rate": 7.532532379592728e-09, "loss": 0.6396, "step": 4872 }, { "epoch": 0.9830573653966783, "grad_norm": 0.7053226232528687, "learning_rate": 7.354297623105577e-09, "loss": 0.6267, "step": 4873 }, { "epoch": 0.9832591009528853, "grad_norm": 0.5108756422996521, "learning_rate": 7.1781953331984125e-09, "loss": 0.8235, "step": 4874 }, { "epoch": 0.9834608365090923, "grad_norm": 0.5003656148910522, "learning_rate": 7.004225585088287e-09, "loss": 0.6547, "step": 4875 }, { "epoch": 0.9836625720652993, "grad_norm": 1.7763170003890991, "learning_rate": 6.832388453080762e-09, "loss": 0.6739, "step": 4876 }, { "epoch": 0.9838643076215062, "grad_norm": 0.5061050653457642, "learning_rate": 6.662684010572129e-09, "loss": 0.8183, "step": 4877 }, { "epoch": 0.9840660431777133, "grad_norm": 0.46445515751838684, "learning_rate": 6.495112330046072e-09, "loss": 0.6868, "step": 4878 }, { "epoch": 0.9842677787339202, "grad_norm": 0.35919633507728577, "learning_rate": 6.329673483076448e-09, "loss": 0.6631, "step": 4879 }, { "epoch": 0.9844695142901273, "grad_norm": 0.6525983810424805, "learning_rate": 6.166367540325624e-09, "loss": 0.6468, "step": 4880 }, { "epoch": 0.9846712498463343, "grad_norm": 0.4751875102519989, "learning_rate": 6.005194571545581e-09, "loss": 0.6931, "step": 4881 }, { "epoch": 0.9848729854025412, "grad_norm": 0.560192346572876, "learning_rate": 5.846154645575697e-09, "loss": 0.8019, "step": 4882 }, { "epoch": 0.9850747209587483, "grad_norm": 0.5137495398521423, "learning_rate": 5.689247830346079e-09, "loss": 0.68, "step": 4883 }, { "epoch": 0.9852764565149552, "grad_norm": 0.4108632504940033, "learning_rate": 5.534474192875339e-09, "loss": 0.8589, "step": 4884 }, { "epoch": 0.9854781920711622, "grad_norm": 0.3733404278755188, "learning_rate": 5.381833799269487e-09, "loss": 0.6393, "step": 4885 }, { "epoch": 0.9856799276273692, "grad_norm": 0.4676814377307892, "learning_rate": 5.231326714725815e-09, "loss": 0.6373, "step": 4886 }, { "epoch": 0.9858816631835762, "grad_norm": 0.6283035278320312, "learning_rate": 5.082953003528457e-09, "loss": 0.9995, "step": 4887 }, { "epoch": 0.9860833987397832, "grad_norm": 0.49630334973335266, "learning_rate": 4.936712729051163e-09, "loss": 0.7571, "step": 4888 }, { "epoch": 0.9862851342959902, "grad_norm": 0.45282241702079773, "learning_rate": 4.792605953756191e-09, "loss": 0.8741, "step": 4889 }, { "epoch": 0.9864868698521971, "grad_norm": 0.4955006241798401, "learning_rate": 4.650632739194305e-09, "loss": 0.6935, "step": 4890 }, { "epoch": 0.9866886054084042, "grad_norm": 0.7275412082672119, "learning_rate": 4.510793146006442e-09, "loss": 0.9353, "step": 4891 }, { "epoch": 0.9868903409646111, "grad_norm": 0.4954439103603363, "learning_rate": 4.373087233919826e-09, "loss": 0.6653, "step": 4892 }, { "epoch": 0.9870920765208182, "grad_norm": 0.3884679079055786, "learning_rate": 4.2375150617529615e-09, "loss": 0.6975, "step": 4893 }, { "epoch": 0.9872938120770252, "grad_norm": 0.42031583189964294, "learning_rate": 4.104076687410086e-09, "loss": 0.7541, "step": 4894 }, { "epoch": 0.9874955476332321, "grad_norm": 0.3728240728378296, "learning_rate": 3.972772167886718e-09, "loss": 0.6141, "step": 4895 }, { "epoch": 0.9876972831894392, "grad_norm": 0.4875032305717468, "learning_rate": 3.843601559265775e-09, "loss": 0.8307, "step": 4896 }, { "epoch": 0.9878990187456461, "grad_norm": 0.7961161136627197, "learning_rate": 3.716564916718124e-09, "loss": 0.7869, "step": 4897 }, { "epoch": 0.9881007543018532, "grad_norm": 0.7322169542312622, "learning_rate": 3.591662294504805e-09, "loss": 0.6661, "step": 4898 }, { "epoch": 0.9883024898580601, "grad_norm": 0.3394564986228943, "learning_rate": 3.4688937459737004e-09, "loss": 0.69, "step": 4899 }, { "epoch": 0.9885042254142671, "grad_norm": 0.9545552730560303, "learning_rate": 3.3482593235617533e-09, "loss": 0.6647, "step": 4900 }, { "epoch": 0.9887059609704741, "grad_norm": 0.3521050214767456, "learning_rate": 3.2297590787955248e-09, "loss": 0.6554, "step": 4901 }, { "epoch": 0.9889076965266811, "grad_norm": 0.4236242473125458, "learning_rate": 3.1133930622878618e-09, "loss": 0.8047, "step": 4902 }, { "epoch": 0.9891094320828882, "grad_norm": 0.4478450119495392, "learning_rate": 2.9991613237417837e-09, "loss": 0.7946, "step": 4903 }, { "epoch": 0.9893111676390951, "grad_norm": 0.8264681696891785, "learning_rate": 2.8870639119482622e-09, "loss": 0.7464, "step": 4904 }, { "epoch": 0.989512903195302, "grad_norm": 1.0444552898406982, "learning_rate": 2.7771008747867757e-09, "loss": 0.6486, "step": 4905 }, { "epoch": 0.9897146387515091, "grad_norm": 0.34184062480926514, "learning_rate": 2.669272259223643e-09, "loss": 0.6797, "step": 4906 }, { "epoch": 0.9899163743077161, "grad_norm": 0.42163538932800293, "learning_rate": 2.563578111315912e-09, "loss": 0.6164, "step": 4907 }, { "epoch": 0.990118109863923, "grad_norm": 0.41893240809440613, "learning_rate": 2.460018476207471e-09, "loss": 0.6805, "step": 4908 }, { "epoch": 0.9903198454201301, "grad_norm": 0.4893782138824463, "learning_rate": 2.3585933981312704e-09, "loss": 0.6462, "step": 4909 }, { "epoch": 0.990521580976337, "grad_norm": 0.3577900230884552, "learning_rate": 2.2593029204076578e-09, "loss": 0.7524, "step": 4910 }, { "epoch": 0.9907233165325441, "grad_norm": 0.3445039391517639, "learning_rate": 2.1621470854454874e-09, "loss": 0.6344, "step": 4911 }, { "epoch": 0.990925052088751, "grad_norm": 0.4528195261955261, "learning_rate": 2.067125934742675e-09, "loss": 0.6931, "step": 4912 }, { "epoch": 0.991126787644958, "grad_norm": 0.5998149514198303, "learning_rate": 1.9742395088845346e-09, "loss": 0.7681, "step": 4913 }, { "epoch": 0.991328523201165, "grad_norm": 0.3263251483440399, "learning_rate": 1.8834878475454398e-09, "loss": 0.7044, "step": 4914 }, { "epoch": 0.991530258757372, "grad_norm": 0.6413541436195374, "learning_rate": 1.794870989486608e-09, "loss": 0.7415, "step": 4915 }, { "epoch": 0.9917319943135791, "grad_norm": 0.5035390853881836, "learning_rate": 1.708388972558317e-09, "loss": 0.8024, "step": 4916 }, { "epoch": 0.991933729869786, "grad_norm": 0.5376717448234558, "learning_rate": 1.6240418336993525e-09, "loss": 0.6732, "step": 4917 }, { "epoch": 0.992135465425993, "grad_norm": 0.6315147280693054, "learning_rate": 1.5418296089358964e-09, "loss": 0.6264, "step": 4918 }, { "epoch": 0.9923372009822, "grad_norm": 0.43847543001174927, "learning_rate": 1.4617523333820827e-09, "loss": 0.7628, "step": 4919 }, { "epoch": 0.992538936538407, "grad_norm": 0.465602844953537, "learning_rate": 1.3838100412416622e-09, "loss": 0.6496, "step": 4920 }, { "epoch": 0.992740672094614, "grad_norm": 0.43424639105796814, "learning_rate": 1.3080027658052275e-09, "loss": 0.818, "step": 4921 }, { "epoch": 0.992942407650821, "grad_norm": 0.46224871277809143, "learning_rate": 1.2343305394507677e-09, "loss": 0.6756, "step": 4922 }, { "epoch": 0.9931441432070279, "grad_norm": 0.8015269637107849, "learning_rate": 1.1627933936464442e-09, "loss": 0.7697, "step": 4923 }, { "epoch": 0.993345878763235, "grad_norm": 0.36045822501182556, "learning_rate": 1.0933913589461497e-09, "loss": 0.7097, "step": 4924 }, { "epoch": 0.9935476143194419, "grad_norm": 0.5318560600280762, "learning_rate": 1.0261244649945045e-09, "loss": 0.8177, "step": 4925 }, { "epoch": 0.9937493498756489, "grad_norm": 0.32715970277786255, "learning_rate": 9.60992740521305e-10, "loss": 0.6823, "step": 4926 }, { "epoch": 0.993951085431856, "grad_norm": 0.5524141192436218, "learning_rate": 8.979962133459641e-10, "loss": 0.6753, "step": 4927 }, { "epoch": 0.9941528209880629, "grad_norm": 0.5175045132637024, "learning_rate": 8.371349103764026e-10, "loss": 0.7542, "step": 4928 }, { "epoch": 0.99435455654427, "grad_norm": 0.42321813106536865, "learning_rate": 7.784088576068272e-10, "loss": 0.7561, "step": 4929 }, { "epoch": 0.9945562921004769, "grad_norm": 0.7016764283180237, "learning_rate": 7.218180801210617e-10, "loss": 1.0079, "step": 4930 }, { "epoch": 0.9947580276566839, "grad_norm": 0.3303213119506836, "learning_rate": 6.673626020903267e-10, "loss": 0.7609, "step": 4931 }, { "epoch": 0.9949597632128909, "grad_norm": 0.3997403681278229, "learning_rate": 6.150424467732397e-10, "loss": 0.7826, "step": 4932 }, { "epoch": 0.9951614987690979, "grad_norm": 0.5176541209220886, "learning_rate": 5.648576365169245e-10, "loss": 0.7491, "step": 4933 }, { "epoch": 0.9953632343253049, "grad_norm": 1.5391536951065063, "learning_rate": 5.168081927564572e-10, "loss": 0.6885, "step": 4934 }, { "epoch": 0.9955649698815119, "grad_norm": 0.4760850965976715, "learning_rate": 4.708941360148655e-10, "loss": 0.6359, "step": 4935 }, { "epoch": 0.9957667054377188, "grad_norm": 0.41108208894729614, "learning_rate": 4.2711548590368414e-10, "loss": 1.0169, "step": 4936 }, { "epoch": 0.9959684409939259, "grad_norm": 0.7306568622589111, "learning_rate": 3.854722611201789e-10, "loss": 0.6983, "step": 4937 }, { "epoch": 0.9961701765501328, "grad_norm": 2.0669448375701904, "learning_rate": 3.459644794523431e-10, "loss": 0.6308, "step": 4938 }, { "epoch": 0.9963719121063399, "grad_norm": 0.8391724824905396, "learning_rate": 3.0859215777445663e-10, "loss": 0.9293, "step": 4939 }, { "epoch": 0.9965736476625469, "grad_norm": 0.6950282454490662, "learning_rate": 2.7335531204930597e-10, "loss": 0.6589, "step": 4940 }, { "epoch": 0.9967753832187538, "grad_norm": 0.36784011125564575, "learning_rate": 2.4025395732651947e-10, "loss": 0.679, "step": 4941 }, { "epoch": 0.9969771187749609, "grad_norm": 0.3888319134712219, "learning_rate": 2.0928810774534237e-10, "loss": 0.6201, "step": 4942 }, { "epoch": 0.9971788543311678, "grad_norm": 0.3902921974658966, "learning_rate": 1.8045777653130648e-10, "loss": 0.6859, "step": 4943 }, { "epoch": 0.9973805898873748, "grad_norm": 2.439812421798706, "learning_rate": 1.5376297599845046e-10, "loss": 0.688, "step": 4944 }, { "epoch": 0.9975823254435818, "grad_norm": 0.39363893866539, "learning_rate": 1.2920371754931994e-10, "loss": 0.8237, "step": 4945 }, { "epoch": 0.9977840609997888, "grad_norm": 0.4545292258262634, "learning_rate": 1.0678001167274688e-10, "loss": 0.639, "step": 4946 }, { "epoch": 0.9979857965559958, "grad_norm": 0.307205468416214, "learning_rate": 8.649186794773556e-11, "loss": 0.649, "step": 4947 }, { "epoch": 0.9981875321122028, "grad_norm": 0.48131927847862244, "learning_rate": 6.833929503846648e-11, "loss": 0.6642, "step": 4948 }, { "epoch": 0.9983892676684097, "grad_norm": 0.4924654960632324, "learning_rate": 5.2322300698737225e-11, "loss": 0.6458, "step": 4949 }, { "epoch": 0.9985910032246168, "grad_norm": 0.4332221448421478, "learning_rate": 3.8440891769742085e-11, "loss": 0.7168, "step": 4950 }, { "epoch": 0.9987927387808238, "grad_norm": 0.5997542142868042, "learning_rate": 2.6695074181182225e-11, "loss": 0.6842, "step": 4951 }, { "epoch": 0.9989944743370308, "grad_norm": 0.7213303446769714, "learning_rate": 1.708485294904527e-11, "loss": 0.8125, "step": 4952 }, { "epoch": 0.9991962098932378, "grad_norm": 0.8712928891181946, "learning_rate": 9.610232178380862e-12, "loss": 0.6654, "step": 4953 }, { "epoch": 0.9993979454494447, "grad_norm": 0.37508413195610046, "learning_rate": 4.271215061635303e-12, "loss": 0.689, "step": 4954 }, { "epoch": 0.9995996810056518, "grad_norm": 0.3845532536506653, "learning_rate": 1.0678038792066857e-12, "loss": 0.6463, "step": 4955 }, { "epoch": 0.9998014165618587, "grad_norm": 0.40240800380706787, "learning_rate": 0.0, "loss": 0.6359, "step": 4956 }, { "epoch": 0.9998014165618587, "step": 4956, "total_flos": 6.502525695783076e+18, "train_loss": 0.7414010693292833, "train_runtime": 166490.8866, "train_samples_per_second": 7.622, "train_steps_per_second": 0.03 } ], "logging_steps": 1.0, "max_steps": 4956, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.502525695783076e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }